Cleaning up / removing most of the monkeying around with annotation values that happens in VariantDataManager

2011-07-08 12:48:33 -04:00 · 2011-07-08 12:48:33 -04:00 · 2a4b3ae4a2
parent 9a14b914b7
commit 2a4b3ae4a2
3 changed files with 13 additions and 29 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
@ -82,19 +82,11 @@ public class VariantDataManager {
            }

            foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6);
-            if( annotationKeys.get(iii).toLowerCase().contains("ranksum") ) { // BUGBUG: to clean up
-                for( final VariantDatum datum : data ) {
-                    if( datum.annotations[iii] > 0.0 ) { datum.annotations[iii] /= 3.0; }
-                }
-            }
            meanVector[iii] = theMean;
            varianceVector[iii] = theSTD;
            for( final VariantDatum datum : data ) {
+                // Transform each data point via: (x - mean) / standard deviation
                datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
-                // Each data point is now [ (x - mean) / standard deviation ]
-                if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
-                    datum.annotations[iii] /= 3.0;
-                }
            }
        }
        if( foundZeroVarianceAnnotation ) {
@ -163,7 +155,7 @@ public class VariantDataManager {
        final int numBadSitesAdded = trainingData.size();
        logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." );

-        // Next, sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants
+        // Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants
        Collections.sort( data );
        final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) );
        if( numToAdd > data.size() ) {
@ -241,23 +233,15 @@ public class VariantDataManager {
        double value;

        try {
-            if( annotationKey.equalsIgnoreCase("QUAL") ) {
-                value = vc.getPhredScaledQual();
-            } else if( annotationKey.equalsIgnoreCase("DP") ) {
-                value = Double.parseDouble( (String)vc.getAttribute( "DP" ) ) / Double.parseDouble( (String)vc.getAttribute( "AN" ) );
-            } else {
-                value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
-                if( Double.isInfinite(value) ) { value = Double.NaN; }
-                if( annotationKey.equalsIgnoreCase("InbreedingCoeff") && value > 0.05 ) { value = Double.NaN; }
-                if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
-                      value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
-                }
-                if( annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
-                if( annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
+            value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
+            if( Double.isInfinite(value) ) { value = Double.NaN; }
+            if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
+                  value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
            }
-
+            if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
+            if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
        } catch( Exception e ) {
-            value = Double.NaN; // The VQSR works with missing data now by marginalizing over the missing dimension when evaluating Gaussians
+            value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
        }

        return value;
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@ -284,7 +284,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
            throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, "", e);
        }

-        // We make extensive use of the ggplot2 library: http://had.co.nz/ggplot2/
+        // We make extensive use of the ggplot2 R library: http://had.co.nz/ggplot2/
        stream.println("library(ggplot2)");

        createArrangeFunction( stream );
@ -378,6 +378,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
        }
     }

+    // The Arrange function is how we place the 4 model plots on one page
    // from http://gettinggeneticsdone.blogspot.com/2010/03/arrange-multiple-ggplot2-plots-in-same.html
    private void createArrangeFunction( final PrintStream stream ) {
        stream.println("vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)");
@ -402,5 +403,4 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
        stream.println("}");
        stream.println("}");
    }
-
- }
+}
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
@ -60,7 +60,7 @@ public class VariantRecalibratorArgumentCollection {
    @Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false)
    public double PRIOR_COUNTS = 20.0;
    @Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
-    public double PERCENT_BAD_VARIANTS = 0.015;
+    public double PERCENT_BAD_VARIANTS = 0.03;
    @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false)
    public int MIN_NUM_BAD_VARIANTS = 2000;
 }