Merge branch 'master' of ssh://tin.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
f7d188232f
|
|
@ -5,6 +5,7 @@ import net.sf.samtools.*;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -113,9 +114,10 @@ public class ConstrainedMateFixingManager {
|
||||||
HashMap<String, SAMRecordHashObject> forMateMatching = new HashMap<String, SAMRecordHashObject>();
|
HashMap<String, SAMRecordHashObject> forMateMatching = new HashMap<String, SAMRecordHashObject>();
|
||||||
TreeSet<SAMRecord> waitingReads = new TreeSet<SAMRecord>(comparer);
|
TreeSet<SAMRecord> waitingReads = new TreeSet<SAMRecord>(comparer);
|
||||||
|
|
||||||
private <T> T remove(TreeSet<T> treeSet) {
|
private SAMRecord remove(TreeSet<SAMRecord> treeSet) {
|
||||||
final T first = treeSet.first();
|
final SAMRecord first = treeSet.first();
|
||||||
treeSet.remove(first);
|
if ( !treeSet.remove(first) )
|
||||||
|
throw new UserException("Error caching SAM record " + first.getReadName() + ", which is usually caused by malformed SAM/BAM files in which multiple identical copies of a read are present.");
|
||||||
return first;
|
return first;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -97,7 +97,7 @@ public class GaussianMixtureModel {
|
||||||
|
|
||||||
int ttt = 0;
|
int ttt = 0;
|
||||||
while( ttt++ < numIterations ) {
|
while( ttt++ < numIterations ) {
|
||||||
// Estep: assign each variant to the nearest cluster
|
// E step: assign each variant to the nearest cluster
|
||||||
for( final VariantDatum datum : data ) {
|
for( final VariantDatum datum : data ) {
|
||||||
double minDistance = Double.MAX_VALUE;
|
double minDistance = Double.MAX_VALUE;
|
||||||
MultivariateGaussian minGaussian = null;
|
MultivariateGaussian minGaussian = null;
|
||||||
|
|
@ -112,7 +112,7 @@ public class GaussianMixtureModel {
|
||||||
datum.assignment = minGaussian;
|
datum.assignment = minGaussian;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mstep: update gaussian means based on assigned variants
|
// M step: update gaussian means based on assigned variants
|
||||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||||
gaussian.zeroOutMu();
|
gaussian.zeroOutMu();
|
||||||
int numAssigned = 0;
|
int numAssigned = 0;
|
||||||
|
|
@ -229,26 +229,29 @@ public class GaussianMixtureModel {
|
||||||
}
|
}
|
||||||
|
|
||||||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||||
int numVals = 0;
|
int numSamples = 0;
|
||||||
double sumPVarInGaussian = 0.0;
|
double sumPVarInGaussian = 0.0;
|
||||||
int numIter = 10;
|
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
|
||||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||||
|
// for each dimension
|
||||||
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
||||||
// marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
|
// if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
|
||||||
if( datum.isNull[iii] ) {
|
if( datum.isNull[iii] ) {
|
||||||
for( int ttt = 0; ttt < numIter; ttt++ ) {
|
for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) {
|
||||||
datum.annotations[iii] = Normal.staticNextDouble(0.0, 1.0);
|
datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution
|
||||||
|
|
||||||
|
// evaluate this random data point
|
||||||
int gaussianIndex = 0;
|
int gaussianIndex = 0;
|
||||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
||||||
}
|
}
|
||||||
|
|
||||||
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10));
|
// add this sample's probability to the pile in order to take an average in the end
|
||||||
numVals++;
|
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k))
|
||||||
|
numSamples++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return Math.log10( sumPVarInGaussian / ((double) numVals) );
|
return Math.log10( sumPVarInGaussian / ((double) numSamples) );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -25,7 +25,6 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||||
|
|
||||||
import cern.jet.random.Normal;
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
|
@ -91,7 +90,7 @@ public class VariantDataManager {
|
||||||
meanVector[iii] = theMean;
|
meanVector[iii] = theMean;
|
||||||
varianceVector[iii] = theSTD;
|
varianceVector[iii] = theSTD;
|
||||||
for( final VariantDatum datum : data ) {
|
for( final VariantDatum datum : data ) {
|
||||||
datum.annotations[iii] = ( datum.isNull[iii] ? Normal.staticNextDouble(0.0, 1.0) : ( datum.annotations[iii] - theMean ) / theSTD );
|
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||||
// Each data point is now [ (x - mean) / standard deviation ]
|
// Each data point is now [ (x - mean) / standard deviation ]
|
||||||
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
|
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
|
||||||
datum.annotations[iii] /= 3.0;
|
datum.annotations[iii] /= 3.0;
|
||||||
|
|
|
||||||
|
|
@ -225,7 +225,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
||||||
loc = pos + alleles.get(0).length() - 1;
|
loc = pos + alleles.get(0).length() - 1;
|
||||||
} else if ( !isSingleNucleotideEvent(alleles) ) {
|
} else if ( !isSingleNucleotideEvent(alleles) ) {
|
||||||
ArrayList<Allele> newAlleles = new ArrayList<Allele>();
|
ArrayList<Allele> newAlleles = new ArrayList<Allele>();
|
||||||
loc = clipAlleles(pos, ref, alleles, newAlleles);
|
loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
|
||||||
alleles = newAlleles;
|
alleles = newAlleles;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -504,7 +504,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
||||||
* @param clippedAlleles output list of clipped alleles
|
* @param clippedAlleles output list of clipped alleles
|
||||||
* @return a list of alleles, clipped to the reference
|
* @return a list of alleles, clipped to the reference
|
||||||
*/
|
*/
|
||||||
protected static long clipAlleles(long position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles) {
|
protected static long clipAlleles(long position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
|
||||||
|
|
||||||
// Note that the computation of forward clipping here is meant only to see whether there is a common
|
// Note that the computation of forward clipping here is meant only to see whether there is a common
|
||||||
// base to all alleles, and to correctly compute reverse clipping,
|
// base to all alleles, and to correctly compute reverse clipping,
|
||||||
|
|
@ -522,6 +522,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
||||||
}
|
}
|
||||||
if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
|
if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
|
||||||
clipping = false;
|
clipping = false;
|
||||||
|
else if (ref.length() == reverseClipped)
|
||||||
|
generateException("bad alleles encountered", lineNo);
|
||||||
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
|
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
|
||||||
clipping = false;
|
clipping = false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue