Cleaning up / removing most of the monkeying around with annotation values that happens in VariantDataManager
This commit is contained in:
parent
9a14b914b7
commit
2a4b3ae4a2
|
|
@ -82,19 +82,11 @@ public class VariantDataManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6);
|
foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6);
|
||||||
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") ) { // BUGBUG: to clean up
|
|
||||||
for( final VariantDatum datum : data ) {
|
|
||||||
if( datum.annotations[iii] > 0.0 ) { datum.annotations[iii] /= 3.0; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
meanVector[iii] = theMean;
|
meanVector[iii] = theMean;
|
||||||
varianceVector[iii] = theSTD;
|
varianceVector[iii] = theSTD;
|
||||||
for( final VariantDatum datum : data ) {
|
for( final VariantDatum datum : data ) {
|
||||||
|
// Transform each data point via: (x - mean) / standard deviation
|
||||||
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
|
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||||
// Each data point is now [ (x - mean) / standard deviation ]
|
|
||||||
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
|
|
||||||
datum.annotations[iii] /= 3.0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if( foundZeroVarianceAnnotation ) {
|
if( foundZeroVarianceAnnotation ) {
|
||||||
|
|
@ -163,7 +155,7 @@ public class VariantDataManager {
|
||||||
final int numBadSitesAdded = trainingData.size();
|
final int numBadSitesAdded = trainingData.size();
|
||||||
logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." );
|
logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." );
|
||||||
|
|
||||||
// Next, sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants
|
// Next, sort the variants by the LOD coming from the positive model and add the bottom X percent of variants to the list
|
||||||
Collections.sort( data );
|
Collections.sort( data );
|
||||||
final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) );
|
final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) );
|
||||||
if( numToAdd > data.size() ) {
|
if( numToAdd > data.size() ) {
|
||||||
|
|
@ -241,23 +233,15 @@ public class VariantDataManager {
|
||||||
double value;
|
double value;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if( annotationKey.equalsIgnoreCase("QUAL") ) {
|
|
||||||
value = vc.getPhredScaledQual();
|
|
||||||
} else if( annotationKey.equalsIgnoreCase("DP") ) {
|
|
||||||
value = Double.parseDouble( (String)vc.getAttribute( "DP" ) ) / Double.parseDouble( (String)vc.getAttribute( "AN" ) );
|
|
||||||
} else {
|
|
||||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||||
if( Double.isInfinite(value) ) { value = Double.NaN; }
|
if( Double.isInfinite(value) ) { value = Double.NaN; }
|
||||||
if( annotationKey.equalsIgnoreCase("InbreedingCoeff") && value > 0.05 ) { value = Double.NaN; }
|
|
||||||
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
|
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
|
||||||
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
|
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
|
||||||
}
|
}
|
||||||
if( annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||||
if( annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||||
}
|
|
||||||
|
|
||||||
} catch( Exception e ) {
|
} catch( Exception e ) {
|
||||||
value = Double.NaN; // The VQSR works with missing data now by marginalizing over the missing dimension when evaluating Gaussians
|
value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
|
||||||
}
|
}
|
||||||
|
|
||||||
return value;
|
return value;
|
||||||
|
|
|
||||||
|
|
@ -284,7 +284,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, "", e);
|
throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, "", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
// We make extensive use of the ggplot2 library: http://had.co.nz/ggplot2/
|
// We make extensive use of the ggplot2 R library: http://had.co.nz/ggplot2/
|
||||||
stream.println("library(ggplot2)");
|
stream.println("library(ggplot2)");
|
||||||
|
|
||||||
createArrangeFunction( stream );
|
createArrangeFunction( stream );
|
||||||
|
|
@ -378,6 +378,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The Arrange function is how we place the 4 model plots on one page
|
||||||
// from http://gettinggeneticsdone.blogspot.com/2010/03/arrange-multiple-ggplot2-plots-in-same.html
|
// from http://gettinggeneticsdone.blogspot.com/2010/03/arrange-multiple-ggplot2-plots-in-same.html
|
||||||
private void createArrangeFunction( final PrintStream stream ) {
|
private void createArrangeFunction( final PrintStream stream ) {
|
||||||
stream.println("vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)");
|
stream.println("vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)");
|
||||||
|
|
@ -402,5 +403,4 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
stream.println("}");
|
stream.println("}");
|
||||||
stream.println("}");
|
stream.println("}");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -60,7 +60,7 @@ public class VariantRecalibratorArgumentCollection {
|
||||||
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false)
|
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false)
|
||||||
public double PRIOR_COUNTS = 20.0;
|
public double PRIOR_COUNTS = 20.0;
|
||||||
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
|
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
|
||||||
public double PERCENT_BAD_VARIANTS = 0.015;
|
public double PERCENT_BAD_VARIANTS = 0.03;
|
||||||
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false)
|
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false)
|
||||||
public int MIN_NUM_BAD_VARIANTS = 2000;
|
public int MIN_NUM_BAD_VARIANTS = 2000;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue