Cleaning up / removing most of the monkeying around with annotation values that happens in VariantDataManager

This commit is contained in:
Ryan Poplin 2011-07-08 12:48:33 -04:00
parent 9a14b914b7
commit 2a4b3ae4a2
3 changed files with 13 additions and 29 deletions

View File

@ -82,19 +82,11 @@ public class VariantDataManager {
}
foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6);
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") ) { // BUGBUG: to clean up
for( final VariantDatum datum : data ) {
if( datum.annotations[iii] > 0.0 ) { datum.annotations[iii] /= 3.0; }
}
}
meanVector[iii] = theMean;
varianceVector[iii] = theSTD;
for( final VariantDatum datum : data ) {
// Transform each data point via: (x - mean) / standard deviation
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
// Each data point is now [ (x - mean) / standard deviation ]
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
datum.annotations[iii] /= 3.0;
}
}
}
if( foundZeroVarianceAnnotation ) {
@ -163,7 +155,7 @@ public class VariantDataManager {
final int numBadSitesAdded = trainingData.size();
logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." );
// Next, sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants
// Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants
Collections.sort( data );
final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) );
if( numToAdd > data.size() ) {
@ -241,23 +233,15 @@ public class VariantDataManager {
double value;
try {
if( annotationKey.equalsIgnoreCase("QUAL") ) {
value = vc.getPhredScaledQual();
} else if( annotationKey.equalsIgnoreCase("DP") ) {
value = Double.parseDouble( (String)vc.getAttribute( "DP" ) ) / Double.parseDouble( (String)vc.getAttribute( "AN" ) );
} else {
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
if( Double.isInfinite(value) ) { value = Double.NaN; }
if( annotationKey.equalsIgnoreCase("InbreedingCoeff") && value > 0.05 ) { value = Double.NaN; }
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
}
if( annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
if( annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
if( Double.isInfinite(value) ) { value = Double.NaN; }
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
}
if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
} catch( Exception e ) {
value = Double.NaN; // The VQSR works with missing data now by marginalizing over the missing dimension when evaluating Gaussians
value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
}
return value;

View File

@ -284,7 +284,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, "", e);
}
// We make extensive use of the ggplot2 library: http://had.co.nz/ggplot2/
// We make extensive use of the ggplot2 R library: http://had.co.nz/ggplot2/
stream.println("library(ggplot2)");
createArrangeFunction( stream );
@ -378,6 +378,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
}
}
// The Arrange function is how we place the 4 model plots on one page
// from http://gettinggeneticsdone.blogspot.com/2010/03/arrange-multiple-ggplot2-plots-in-same.html
private void createArrangeFunction( final PrintStream stream ) {
stream.println("vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)");
@ -402,5 +403,4 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
stream.println("}");
stream.println("}");
}
}
}

View File

@ -60,7 +60,7 @@ public class VariantRecalibratorArgumentCollection {
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false)
public double PRIOR_COUNTS = 20.0;
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
public double PERCENT_BAD_VARIANTS = 0.015;
public double PERCENT_BAD_VARIANTS = 0.03;
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false)
public int MIN_NUM_BAD_VARIANTS = 2000;
}