Merge branch 'master' of ssh://tin.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Christopher Hartl 2011-07-08 12:32:32 -04:00
commit f7d188232f
4 changed files with 23 additions and 17 deletions

View File

@@ -5,6 +5,7 @@ import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*;
@@ -113,9 +114,10 @@ public class ConstrainedMateFixingManager {
HashMap<String, SAMRecordHashObject> forMateMatching = new HashMap<String, SAMRecordHashObject>();
TreeSet<SAMRecord> waitingReads = new TreeSet<SAMRecord>(comparer);
private <T> T remove(TreeSet<T> treeSet) {
final T first = treeSet.first();
treeSet.remove(first);
// Pops the head (smallest) element from the waiting-read set.
// If the set refuses the removal — which indicates duplicate identical reads
// defeating the comparator — we fail loudly rather than loop forever.
private SAMRecord remove(TreeSet<SAMRecord> treeSet) {
    final SAMRecord head = treeSet.first();
    final boolean removed = treeSet.remove(head);
    if ( !removed ) {
        throw new UserException("Error caching SAM record " + head.getReadName() + ", which is usually caused by malformed SAM/BAM files in which multiple identical copies of a read are present.");
    }
    return head;
}

View File

@@ -97,7 +97,7 @@ public class GaussianMixtureModel {
int ttt = 0;
while( ttt++ < numIterations ) {
// Estep: assign each variant to the nearest cluster
// E step: assign each variant to the nearest cluster
for( final VariantDatum datum : data ) {
double minDistance = Double.MAX_VALUE;
MultivariateGaussian minGaussian = null;
@@ -112,7 +112,7 @@ public class GaussianMixtureModel {
datum.assignment = minGaussian;
}
// Mstep: update gaussian means based on assigned variants
// M step: update gaussian means based on assigned variants
for( final MultivariateGaussian gaussian : gaussians ) {
gaussian.zeroOutMu();
int numAssigned = 0;
@@ -229,26 +229,29 @@ public class GaussianMixtureModel {
}
public double evaluateDatumMarginalized( final VariantDatum datum ) {
int numVals = 0;
int numSamples = 0;
double sumPVarInGaussian = 0.0;
int numIter = 10;
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
// for each dimension
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
// marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
// if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
if( datum.isNull[iii] ) {
for( int ttt = 0; ttt < numIter; ttt++ ) {
datum.annotations[iii] = Normal.staticNextDouble(0.0, 1.0);
for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) {
datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution
// evaluate this random data point
int gaussianIndex = 0;
for( final MultivariateGaussian gaussian : gaussians ) {
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
}
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10));
numVals++;
// add this sample's probability to the pile in order to take an average in the end
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k))
numSamples++;
}
}
}
return Math.log10( sumPVarInGaussian / ((double) numVals) );
return Math.log10( sumPVarInGaussian / ((double) numSamples) );
}
}

View File

@@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import cern.jet.random.Normal;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@@ -91,7 +90,7 @@ public class VariantDataManager {
meanVector[iii] = theMean;
varianceVector[iii] = theSTD;
for( final VariantDatum datum : data ) {
datum.annotations[iii] = ( datum.isNull[iii] ? Normal.staticNextDouble(0.0, 1.0) : ( datum.annotations[iii] - theMean ) / theSTD );
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
// Each data point is now [ (x - mean) / standard deviation ]
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
datum.annotations[iii] /= 3.0;

View File

@@ -225,7 +225,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
loc = pos + alleles.get(0).length() - 1;
} else if ( !isSingleNucleotideEvent(alleles) ) {
ArrayList<Allele> newAlleles = new ArrayList<Allele>();
loc = clipAlleles(pos, ref, alleles, newAlleles);
loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
alleles = newAlleles;
}
@@ -504,7 +504,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
* @param clippedAlleles output list of clipped alleles
* @return a list of alleles, clipped to the reference
*/
protected static long clipAlleles(long position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles) {
protected static long clipAlleles(long position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
// Note that the computation of forward clipping here is meant only to see whether there is a common
// base to all alleles, and to correctly compute reverse clipping,
@@ -522,6 +522,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
}
if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
clipping = false;
else if (ref.length() == reverseClipped)
generateException("bad alleles encountered", lineNo);
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
clipping = false;
}