Cleaning up / removing most of the monkeying around with annotation values that happens in VariantDataManager
This commit is contained in:
parent
9a14b914b7
commit
2a4b3ae4a2
|
|
@ -82,19 +82,11 @@ public class VariantDataManager {
|
|||
}
|
||||
|
||||
foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6);
|
||||
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") ) { // BUGBUG: to clean up
|
||||
for( final VariantDatum datum : data ) {
|
||||
if( datum.annotations[iii] > 0.0 ) { datum.annotations[iii] /= 3.0; }
|
||||
}
|
||||
}
|
||||
meanVector[iii] = theMean;
|
||||
varianceVector[iii] = theSTD;
|
||||
for( final VariantDatum datum : data ) {
|
||||
// Transform each data point via: (x - mean) / standard deviation
|
||||
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||
// Each data point is now [ (x - mean) / standard deviation ]
|
||||
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
|
||||
datum.annotations[iii] /= 3.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( foundZeroVarianceAnnotation ) {
|
||||
|
|
@ -163,7 +155,7 @@ public class VariantDataManager {
|
|||
final int numBadSitesAdded = trainingData.size();
|
||||
logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." );
|
||||
|
||||
// Next, sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants
|
||||
// Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants
|
||||
Collections.sort( data );
|
||||
final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) );
|
||||
if( numToAdd > data.size() ) {
|
||||
|
|
@ -241,23 +233,15 @@ public class VariantDataManager {
|
|||
double value;
|
||||
|
||||
try {
|
||||
if( annotationKey.equalsIgnoreCase("QUAL") ) {
|
||||
value = vc.getPhredScaledQual();
|
||||
} else if( annotationKey.equalsIgnoreCase("DP") ) {
|
||||
value = Double.parseDouble( (String)vc.getAttribute( "DP" ) ) / Double.parseDouble( (String)vc.getAttribute( "AN" ) );
|
||||
} else {
|
||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||
if( Double.isInfinite(value) ) { value = Double.NaN; }
|
||||
if( annotationKey.equalsIgnoreCase("InbreedingCoeff") && value > 0.05 ) { value = Double.NaN; }
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
|
||||
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
|
||||
}
|
||||
if( annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||
if( annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||
if( Double.isInfinite(value) ) { value = Double.NaN; }
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
|
||||
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
|
||||
}
|
||||
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||
if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||
} catch( Exception e ) {
|
||||
value = Double.NaN; // The VQSR works with missing data now by marginalizing over the missing dimension when evaluating Gaussians
|
||||
value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
|
||||
}
|
||||
|
||||
return value;
|
||||
|
|
|
|||
|
|
@ -284,7 +284,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, "", e);
|
||||
}
|
||||
|
||||
// We make extensive use of the ggplot2 library: http://had.co.nz/ggplot2/
|
||||
// We make extensive use of the ggplot2 R library: http://had.co.nz/ggplot2/
|
||||
stream.println("library(ggplot2)");
|
||||
|
||||
createArrangeFunction( stream );
|
||||
|
|
@ -378,6 +378,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
}
|
||||
}
|
||||
|
||||
// The Arrange function is how we place the 4 model plots on one page
|
||||
// from http://gettinggeneticsdone.blogspot.com/2010/03/arrange-multiple-ggplot2-plots-in-same.html
|
||||
private void createArrangeFunction( final PrintStream stream ) {
|
||||
stream.println("vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)");
|
||||
|
|
@ -402,5 +403,4 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
stream.println("}");
|
||||
stream.println("}");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ public class VariantRecalibratorArgumentCollection {
|
|||
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false)
|
||||
public double PRIOR_COUNTS = 20.0;
|
||||
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
|
||||
public double PERCENT_BAD_VARIANTS = 0.015;
|
||||
public double PERCENT_BAD_VARIANTS = 0.03;
|
||||
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false)
|
||||
public int MIN_NUM_BAD_VARIANTS = 2000;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue