Merge branch 'master' of ssh://tin.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
f7d188232f
|
|
@ -5,6 +5,7 @@ import net.sf.samtools.*;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -113,9 +114,10 @@ public class ConstrainedMateFixingManager {
|
|||
HashMap<String, SAMRecordHashObject> forMateMatching = new HashMap<String, SAMRecordHashObject>();
|
||||
TreeSet<SAMRecord> waitingReads = new TreeSet<SAMRecord>(comparer);
|
||||
|
||||
private <T> T remove(TreeSet<T> treeSet) {
|
||||
final T first = treeSet.first();
|
||||
treeSet.remove(first);
|
||||
private SAMRecord remove(TreeSet<SAMRecord> treeSet) {
|
||||
final SAMRecord first = treeSet.first();
|
||||
if ( !treeSet.remove(first) )
|
||||
throw new UserException("Error caching SAM record " + first.getReadName() + ", which is usually caused by malformed SAM/BAM files in which multiple identical copies of a read are present.");
|
||||
return first;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -97,7 +97,7 @@ public class GaussianMixtureModel {
|
|||
|
||||
int ttt = 0;
|
||||
while( ttt++ < numIterations ) {
|
||||
// Estep: assign each variant to the nearest cluster
|
||||
// E step: assign each variant to the nearest cluster
|
||||
for( final VariantDatum datum : data ) {
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
MultivariateGaussian minGaussian = null;
|
||||
|
|
@ -112,7 +112,7 @@ public class GaussianMixtureModel {
|
|||
datum.assignment = minGaussian;
|
||||
}
|
||||
|
||||
// Mstep: update gaussian means based on assigned variants
|
||||
// M step: update gaussian means based on assigned variants
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.zeroOutMu();
|
||||
int numAssigned = 0;
|
||||
|
|
@ -229,26 +229,29 @@ public class GaussianMixtureModel {
|
|||
}
|
||||
|
||||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||
int numVals = 0;
|
||||
int numSamples = 0;
|
||||
double sumPVarInGaussian = 0.0;
|
||||
int numIter = 10;
|
||||
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
// for each dimension
|
||||
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
||||
// marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
|
||||
// if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
|
||||
if( datum.isNull[iii] ) {
|
||||
for( int ttt = 0; ttt < numIter; ttt++ ) {
|
||||
datum.annotations[iii] = Normal.staticNextDouble(0.0, 1.0);
|
||||
for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) {
|
||||
datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution
|
||||
|
||||
// evaluate this random data point
|
||||
int gaussianIndex = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
||||
}
|
||||
|
||||
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10));
|
||||
numVals++;
|
||||
// add this sample's probability to the pile in order to take an average in the end
|
||||
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k))
|
||||
numSamples++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Math.log10( sumPVarInGaussian / ((double) numVals) );
|
||||
return Math.log10( sumPVarInGaussian / ((double) numSamples) );
|
||||
}
|
||||
}
|
||||
|
|
@ -25,7 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import cern.jet.random.Normal;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -91,7 +90,7 @@ public class VariantDataManager {
|
|||
meanVector[iii] = theMean;
|
||||
varianceVector[iii] = theSTD;
|
||||
for( final VariantDatum datum : data ) {
|
||||
datum.annotations[iii] = ( datum.isNull[iii] ? Normal.staticNextDouble(0.0, 1.0) : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||
// Each data point is now [ (x - mean) / standard deviation ]
|
||||
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
|
||||
datum.annotations[iii] /= 3.0;
|
||||
|
|
|
|||
|
|
@ -225,7 +225,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
|||
loc = pos + alleles.get(0).length() - 1;
|
||||
} else if ( !isSingleNucleotideEvent(alleles) ) {
|
||||
ArrayList<Allele> newAlleles = new ArrayList<Allele>();
|
||||
loc = clipAlleles(pos, ref, alleles, newAlleles);
|
||||
loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
|
||||
alleles = newAlleles;
|
||||
}
|
||||
|
||||
|
|
@ -504,7 +504,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
|||
* @param clippedAlleles output list of clipped alleles
|
||||
* @return a list of alleles, clipped to the reference
|
||||
*/
|
||||
protected static long clipAlleles(long position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles) {
|
||||
protected static long clipAlleles(long position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
|
||||
|
||||
// Note that the computation of forward clipping here is meant only to see whether there is a common
|
||||
// base to all alleles, and to correctly compute reverse clipping,
|
||||
|
|
@ -522,6 +522,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
|||
}
|
||||
if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
|
||||
clipping = false;
|
||||
else if (ref.length() == reverseClipped)
|
||||
generateException("bad alleles encountered", lineNo);
|
||||
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
|
||||
clipping = false;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue