Merge pull request #438 from broadinstitute/rp_vqsr_num_bad_stability_fixes_and_runtime_optimizations

Various VQSR optimizations in runtime and stability.
2013-12-02 08:37:37 -08:00 · 2013-12-02 08:37:37 -08:00 · 84ddfb41b5
parent 6a922e7aca b57054c63c
commit 84ddfb41b5
10 changed files with 364 additions and 174 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
@ -46,10 +46,8 @@

 package org.broadinstitute.sting.gatk.walkers.variantrecalibration;

-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Input;
-import org.broadinstitute.sting.commandline.Output;
-import org.broadinstitute.sting.commandline.RodBinding;
+import org.apache.commons.math.util.MathUtils;
+import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.CommandLineGATK;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -112,6 +110,9 @@ import java.util.*;
@PartitionBy(PartitionType.LOCUS)
 public class ApplyRecalibration extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {

+    public static final String LOW_VQSLOD_FILTER_NAME = "LOW_VQSLOD";
+    private final double DEFAULT_VQSLOD_CUTOFF = 0.0;
+
    /////////////////////////////
    // Inputs
    /////////////////////////////
@ -122,7 +123,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
    public List<RodBinding<VariantContext>> input;
    @Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true)
    protected RodBinding<VariantContext> recal;
-    @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
+    @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=false)
    protected File TRANCHES_FILE;

    /////////////////////////////
@ -134,8 +135,13 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
    /////////////////////////////
    // Command Line Arguments
    /////////////////////////////
+    @Advanced
    @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
-    protected double TS_FILTER_LEVEL = 99.0;
+    protected Double TS_FILTER_LEVEL = null;
+    @Advanced
+    @Argument(fullName="lodCutoff", shortName="lodCutoff", doc="The VQSLOD score below which to start filtering", required=false)
+    protected Double VQSLOD_CUTOFF = null;
+
    /**
     * For this to work properly, the -ignoreFilter argument should also be applied to the VariantRecalibration command.
     */
@ -160,13 +166,15 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
    //---------------------------------------------------------------------------------------------------------------

    public void initialize() {
-        for ( final Tranche t : Tranche.readTranches(TRANCHES_FILE) ) {
-            if ( t.ts >= TS_FILTER_LEVEL ) {
-                tranches.add(t);
+        if( TS_FILTER_LEVEL != null ) {
+            for ( final Tranche t : Tranche.readTranches(TRANCHES_FILE) ) {
+                if ( t.ts >= TS_FILTER_LEVEL ) {
+                    tranches.add(t);
+                }
+                logger.info(String.format("Read tranche " + t));
            }
-            logger.info(String.format("Read tranche " + t));
+            Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity)
        }
-        Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity)

        for( final RodBinding rod : input ) {
            inputNames.add( rod.getName() );
@ -183,19 +191,32 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
        final TreeSet<String> samples = new TreeSet<>();
        samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));

-        if( tranches.size() >= 2 ) {
-            for( int iii = 0; iii < tranches.size() - 1; iii++ ) {
-                final Tranche t = tranches.get(iii);
-                hInfo.add(new VCFFilterHeaderLine(t.name, String.format("Truth sensitivity tranche level for " + t.model.toString() + " model at VQS Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod)));
+        if( TS_FILTER_LEVEL != null ) {
+            // if the user specifies both ts_filter_level and lodCutoff then throw a user error
+            if( VQSLOD_CUTOFF != null ) {
+                throw new UserException("Arguments --ts_filter_level and --lodCutoff are mutually exclusive. Please only specify one option.");
            }
-        }
-        if( tranches.size() >= 1 ) {
-            hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("Truth sensitivity tranche level for " + tranches.get(0).model.toString() + " model at VQS Lod < " + tranches.get(0).minVQSLod)));
-        } else {
-            throw new UserException("No tranches were found in the file or were above the truth sensitivity filter level " + TS_FILTER_LEVEL);
-        }

-        logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1));
+            if( tranches.size() >= 2 ) {
+                for( int iii = 0; iii < tranches.size() - 1; iii++ ) {
+                    final Tranche t = tranches.get(iii);
+                    hInfo.add(new VCFFilterHeaderLine(t.name, String.format("Truth sensitivity tranche level for " + t.model.toString() + " model at VQS Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod)));
+                }
+            }
+            if( tranches.size() >= 1 ) {
+                hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("Truth sensitivity tranche level for " + tranches.get(0).model.toString() + " model at VQS Lod < " + tranches.get(0).minVQSLod)));
+            } else {
+                throw new UserException("No tranches were found in the file or were above the truth sensitivity filter level " + TS_FILTER_LEVEL);
+            }
+
+            logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1));
+        } else {
+            if( VQSLOD_CUTOFF == null ) {
+                VQSLOD_CUTOFF = DEFAULT_VQSLOD_CUTOFF;
+            }
+            hInfo.add(new VCFFilterHeaderLine(LOW_VQSLOD_FILTER_NAME, "VQSLOD < " + VQSLOD_CUTOFF));
+            logger.info("Keeping all variants with VQSLOD >= " + VQSLOD_CUTOFF);
+        }

        final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
        vcfWriter.writeHeader(vcfHeader);
@ -245,7 +266,6 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
                }

                VariantContextBuilder builder = new VariantContextBuilder(vc);
-                String filterString = null;

                // Annotate the new record with its VQSLOD and the worst performing annotation
                builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod);
@ -255,21 +275,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
                if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY))
                    builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);

-                for( int i = tranches.size() - 1; i >= 0; i-- ) {
-                    final Tranche tranche = tranches.get(i);
-                    if( lod >= tranche.minVQSLod ) {
-                        if( i == tranches.size() - 1 ) {
-                            filterString = VCFConstants.PASSES_FILTERS_v4;
-                        } else {
-                            filterString = tranche.name;
-                        }
-                        break;
-                    }
-                }
-
-                if( filterString == null ) {
-                    filterString = tranches.get(0).name+"+";
-                }
+                final String filterString = generateFilterString(lod);

                if( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
                    builder.passFilters();
@ -289,6 +295,36 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
        return 1; // This value isn't used for anything
    }

+    /**
+     * Generate the VCF filter string for this record based on the provided lod score
+     * @param lod non-null double
+     * @return the String to use as the VCF filter field
+     */
+    protected String generateFilterString( final double lod ) {
+        String filterString = null;
+        if( TS_FILTER_LEVEL != null ) {
+            for( int i = tranches.size() - 1; i >= 0; i-- ) {
+                final Tranche tranche = tranches.get(i);
+                if( lod >= tranche.minVQSLod ) {
+                    if( i == tranches.size() - 1 ) {
+                        filterString = VCFConstants.PASSES_FILTERS_v4;
+                    } else {
+                        filterString = tranche.name;
+                    }
+                    break;
+                }
+            }
+
+            if( filterString == null ) {
+                filterString = tranches.get(0).name+"+";
+            }
+        } else {
+            filterString = ( lod < VQSLOD_CUTOFF ? LOW_VQSLOD_FILTER_NAME : VCFConstants.PASSES_FILTERS_v4 );
+        }
+
+        return filterString;
+    }
+
    private static VariantContext getMatchingRecalVC(final VariantContext target, final List<VariantContext> recalVCs) {
        for( final VariantContext recalVC : recalVCs ) {
            if ( target.getEnd() == recalVC.getEnd() ) {
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
@ -267,7 +267,7 @@ public class GaussianMixtureModel {
    public double evaluateDatumMarginalized( final VariantDatum datum ) {
        int numRandomDraws = 0;
        double sumPVarInGaussian = 0.0;
-        final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
+        final int numIterPerMissingAnnotation = 20; // Trade off here between speed of computation and accuracy of the marginalization
        final double[] pVarInGaussianLog10 = new double[gaussians.size()];
        // for each dimension
        for( int iii = 0; iii < datum.annotations.length; iii++ ) {
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java
@ -160,11 +160,11 @@ public class TrancheManager {
        }
    }

-    public static List<Tranche> findTranches( final ArrayList<VariantDatum> data, final double[] tranches, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model ) {
+    public static List<Tranche> findTranches( final List<VariantDatum> data, final double[] tranches, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model ) {
        return findTranches( data, tranches, metric, model, null );
    }

-    public static List<Tranche> findTranches( final ArrayList<VariantDatum> data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) {
+    public static List<Tranche> findTranches( final List<VariantDatum> data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) {
        logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size()));

        Collections.sort( data, new VariantDatum.VariantDatumLODComparator() );
@ -172,7 +172,7 @@ public class TrancheManager {

        if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, data, metric); }

-        List<Tranche> tranches = new ArrayList<Tranche>();
+        List<Tranche> tranches = new ArrayList<>();
        for ( double trancheThreshold : trancheThresholds ) {
            Tranche t = findTranche(data, metric, trancheThreshold, model);

--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java
@ -71,7 +71,7 @@ import java.util.*;
 */

 public class VariantDataManager {
-    private ExpandingArrayList<VariantDatum> data;
+    private List<VariantDatum> data;
    private double[] meanVector;
    private double[] varianceVector; // this is really the standard deviation
    public List<String> annotationKeys;
@ -88,30 +88,30 @@ public class VariantDataManager {
        trainingSets = new ArrayList<>();
    }

-    public void setData( final ExpandingArrayList<VariantDatum> data ) {
+    public void setData( final List<VariantDatum> data ) {
        this.data = data;
    }

-    public ExpandingArrayList<VariantDatum> getData() {
+    public List<VariantDatum> getData() {
        return data;
    }

    public void normalizeData() {
        boolean foundZeroVarianceAnnotation = false;
        for( int iii = 0; iii < meanVector.length; iii++ ) {
-            final double theMean = mean(iii);
-            final double theSTD = standardDeviation(theMean, iii);
+            final double theMean = mean(iii, true);
+            final double theSTD = standardDeviation(theMean, iii, true);
            logger.info( annotationKeys.get(iii) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
            if( Double.isNaN(theMean) ) {
                throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpConstants.forumPost("discussion/49/using-variant-annotator"));
            }

-            foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6);
+            foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-5);
            meanVector[iii] = theMean;
            varianceVector[iii] = theSTD;
            for( final VariantDatum datum : data ) {
                // Transform each data point via: (x - mean) / standard deviation
-                datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
+                datum.annotations[iii] = ( datum.isNull[iii] ? 0.1 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
            }
        }
        if( foundZeroVarianceAnnotation ) {
@ -129,7 +129,7 @@ public class VariantDataManager {

        // re-order the data by increasing standard deviation so that the results don't depend on the order things were specified on the command line
        // standard deviation over the training points is used as a simple proxy for information content, perhaps there is a better thing to use here
-        final List<Integer> theOrder = calculateSortOrder(varianceVector);
+        final List<Integer> theOrder = calculateSortOrder(meanVector);
        annotationKeys = reorderList(annotationKeys, theOrder);
        varianceVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(varianceVector), theOrder));
        meanVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(meanVector), theOrder));
@ -137,40 +137,41 @@ public class VariantDataManager {
            datum.annotations = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.annotations), theOrder));
            datum.isNull = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.isNull), theOrder));
        }
+        logger.info("Annotations are now ordered by their information content: " + annotationKeys.toString());
    }

    /**
     * Get a list of indices which give the ascending sort order of the data array
-     * @param data the data to consider
+     * @param inputVector the data to consider
     * @return a non-null list of integers with length matching the length of the input array
     */
-    protected List<Integer> calculateSortOrder(final double[] data) {
-        final List<Integer> theOrder = new ArrayList<>(data.length);
-        final List<MyStandardDeviation> sortedData = new ArrayList<>(data.length);
+    protected List<Integer> calculateSortOrder(final double[] inputVector) {
+        final List<Integer> theOrder = new ArrayList<>(inputVector.length);
+        final List<MyDoubleForSorting> toBeSorted = new ArrayList<>(inputVector.length);
        int count = 0;
-        for( final double d : data ) {
-            sortedData.add(new MyStandardDeviation(d, count++));
+        for( int iii = 0; iii < inputVector.length; iii++ ) {
+            toBeSorted.add(new MyDoubleForSorting(-1.0 * Math.abs(inputVector[iii] - mean(iii, false)), count++));
        }
-        Collections.sort(sortedData); // sort the data in ascending order
-        for( final MyStandardDeviation d : sortedData ) {
+        Collections.sort(toBeSorted);
+        for( final MyDoubleForSorting d : toBeSorted ) {
            theOrder.add(d.originalIndex); // read off the sort order by looking at the index field
        }
        return theOrder;
    }

-    // small private class to assist in reading off the new ordering of the standard deviation array
-    private class MyStandardDeviation implements Comparable<MyStandardDeviation> {
-        final Double standardDeviation;
+    // small private class to assist in reading off the new ordering of the annotation array
+    private class MyDoubleForSorting implements Comparable<MyDoubleForSorting> {
+        final Double myData;
        final int originalIndex;

-        public MyStandardDeviation( final double standardDeviation, final int originalIndex ) {
-            this.standardDeviation = standardDeviation;
+        public MyDoubleForSorting(final double myData, final int originalIndex) {
+            this.myData = myData;
            this.originalIndex = originalIndex;
        }

        @Override
-        public int compareTo(final MyStandardDeviation other) {
-            return standardDeviation.compareTo(other.standardDeviation);
+        public int compareTo(final MyDoubleForSorting other) {
+            return myData.compareTo(other.myData);
        }
    }

@ -233,92 +234,77 @@ public class VariantDataManager {
        return false;
    }

-    public ExpandingArrayList<VariantDatum> getTrainingData() {
-        final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<>();
+    public List<VariantDatum> getTrainingData() {
+        final List<VariantDatum> trainingData = new ExpandingArrayList<>();
        for( final VariantDatum datum : data ) {
-            if( datum.atTrainingSite && !datum.failingSTDThreshold && datum.originalQual > VRAC.QUAL_THRESHOLD ) {
+            if( datum.atTrainingSite && !datum.failingSTDThreshold ) {
                trainingData.add( datum );
            }
        }
        logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." );
-        if( trainingData.size() < VRAC.NUM_BAD_VARIANTS) {
+        if( trainingData.size() < VRAC.MIN_NUM_BAD_VARIANTS ) {
            logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." );
+        } else if( trainingData.size() > VRAC.MAX_NUM_TRAINING_DATA ) {
+            logger.warn( "WARNING: Very large training set detected. Downsampling to " + VRAC.MAX_NUM_TRAINING_DATA + " training variants." );
+            Collections.shuffle(trainingData);
+            return trainingData.subList(0, VRAC.MAX_NUM_TRAINING_DATA);
        }
        return trainingData;
    }

-    public ExpandingArrayList<VariantDatum> selectWorstVariants( final int minimumNumber ) {
-        // The return value is the list of training variants
-        final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<>();
+    public List<VariantDatum> selectWorstVariants() {
+        final List<VariantDatum> trainingData = new ExpandingArrayList<>();

-        // First add to the training list all sites overlapping any bad sites training tracks
        for( final VariantDatum datum : data ) {
-            if( datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) {
-                trainingData.add( datum );
-            }
-        }
-        final int numBadSitesAdded = trainingData.size();
-        logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." );
-
-        // Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants
-        Collections.sort( data, new VariantDatum.VariantDatumLODComparator() );
-        final int numToAdd = minimumNumber - trainingData.size();
-        if( numToAdd > data.size() ) {
-            throw new UserException.BadInput( "Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. You can try lowering the --numBadVariants argument but this is unsafe." );
-        }
-        int index = 0, numAdded = 0;
-        while( numAdded < numToAdd && index < data.size() ) {
-            final VariantDatum datum = data.get(index++);
-            if( datum != null && !datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) {
+            if( datum != null && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) && datum.lod < VRAC.BAD_LOD_CUTOFF ) {
                datum.atAntiTrainingSite = true;
                trainingData.add( datum );
-                numAdded++;
            }
        }
-        logger.info( "Additionally training with worst " + numToAdd + " scoring variants --> " + (trainingData.size() - numBadSitesAdded) + " variants with LOD <= " + String.format("%.4f", data.get(index).lod) + "." );
+
+        logger.info( "Training with worst " + trainingData.size() + " scoring variants --> variants with LOD <= " + String.format("%.4f", VRAC.BAD_LOD_CUTOFF) + "." );
+
        return trainingData;
    }

-    public ExpandingArrayList<VariantDatum> getRandomDataForPlotting( int numToAdd ) {
-        numToAdd = Math.min(numToAdd, data.size());
-        final ExpandingArrayList<VariantDatum> returnData = new ExpandingArrayList<>();
-        // add numToAdd non-anti training sites to plot
-        for( int iii = 0; iii < numToAdd; iii++) {
-            final VariantDatum datum = data.get(GenomeAnalysisEngine.getRandomGenerator().nextInt(data.size()));
-            if( ! datum.atAntiTrainingSite && !datum.failingSTDThreshold ) {
-                returnData.add(datum);
-            }
-        }
+    public List<VariantDatum> getEvaluationData() {
+        final List<VariantDatum> evaluationData = new ExpandingArrayList<>();

-        final int MAX_ANTI_TRAINING_SITES = 10000;
-        int nAntiTrainingAdded = 0;
-        // Add all anti-training sites to visual
        for( final VariantDatum datum : data ) {
-            if ( nAntiTrainingAdded > MAX_ANTI_TRAINING_SITES )
-                break;
-            else if ( datum.atAntiTrainingSite ) {
-                returnData.add(datum);
-                nAntiTrainingAdded++;
+            if( datum != null && !datum.failingSTDThreshold && !datum.atTrainingSite && !datum.atAntiTrainingSite ) {
+                evaluationData.add( datum );
            }
        }

+        return evaluationData;
+    }
+
+    public List<VariantDatum> getRandomDataForPlotting( final int numToAdd, final List<VariantDatum> trainingData, final List<VariantDatum> antiTrainingData, final List<VariantDatum> evaluationData ) {
+        final List<VariantDatum> returnData = new ExpandingArrayList<>();
+        Collections.shuffle(trainingData);
+        Collections.shuffle(antiTrainingData);
+        Collections.shuffle(evaluationData);
+        returnData.addAll(trainingData.subList(0, Math.min(numToAdd, trainingData.size())));
+        returnData.addAll(antiTrainingData.subList(0, Math.min(numToAdd, antiTrainingData.size())));
+        returnData.addAll(evaluationData.subList(0, Math.min(numToAdd, evaluationData.size())));
+        Collections.shuffle(returnData);
        return returnData;
    }

-    private double mean( final int index ) {
+    protected double mean( final int index, final boolean trainingData ) {
        double sum = 0.0;
        int numNonNull = 0;
        for( final VariantDatum datum : data ) {
-            if( datum.atTrainingSite && !datum.isNull[index] ) { sum += datum.annotations[index]; numNonNull++; }
+            if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += datum.annotations[index]; numNonNull++; }
        }
        return sum / ((double) numNonNull);
    }

-    private double standardDeviation( final double mean, final int index ) {
+    protected double standardDeviation( final double mean, final int index, final boolean trainingData ) {
        double sum = 0.0;
        int numNonNull = 0;
        for( final VariantDatum datum : data ) {
-            if( datum.atTrainingSite && !datum.isNull[index] ) { sum += ((datum.annotations[index] - mean)*(datum.annotations[index] - mean)); numNonNull++; }
+            if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += ((datum.annotations[index] - mean)*(datum.annotations[index] - mean)); numNonNull++; }
        }
        return Math.sqrt( sum / ((double) numNonNull) );
    }
@ -343,12 +329,9 @@ public class VariantDataManager {
        try {
            value = vc.getAttributeAsDouble( annotationKey, Double.NaN );
            if( Double.isInfinite(value) ) { value = Double.NaN; }
-            if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
-                value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
-            }
-
-            if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
-            if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
+            if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); }
+            if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); }
+            if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); }
        } catch( Exception e ) {
            value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
        }
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;

 import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.CommandLineGATK;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -142,7 +143,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model
    public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out
    public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // this variant was used in the negative training set
-    public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive traning set
+    public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive training set
    private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R";

    @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
@ -208,8 +209,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    private String[] IGNORE_INPUT_FILTERS = null;
    @Output(fullName="rscript_file", shortName="rscriptFile", doc="The output rscript file generated by the VQSR to aid in visualization of the input data and learned model", required=false, defaultToStdout=false)
    private File RSCRIPT_FILE = null;
-    @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in the model reporting plots", required=false)
-    protected double TS_FILTER_LEVEL = 99.0;
+
+    @Hidden
+    @Argument(fullName="replicate", shortName="replicate", doc="Used to debug the random number generation inside the VQSR. Do not use.", required=false)
+    protected int REPLICATE = 200;
+    private ArrayList<Double> replicate = new ArrayList<>();

    /////////////////////////////
    // Debug Arguments
@ -223,7 +227,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    /////////////////////////////
    private VariantDataManager dataManager;
    private PrintStream tranchesStream;
-    private final Set<String> ignoreInputFilterSet = new TreeSet<String>();
+    private final Set<String> ignoreInputFilterSet = new TreeSet<>();
    private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );

    //---------------------------------------------------------------------------------------------------------------
@ -232,8 +236,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    //
    //---------------------------------------------------------------------------------------------------------------

+    @Override
    public void initialize() {
-        dataManager = new VariantDataManager( new ArrayList<String>(Arrays.asList(USE_ANNOTATIONS)), VRAC );
+        dataManager = new VariantDataManager( new ArrayList<>(Arrays.asList(USE_ANNOTATIONS)), VRAC );

        if (RSCRIPT_FILE != null && !RScriptExecutor.RSCRIPT_EXISTS)
            Utils.warnUser(logger, String.format(
@ -262,9 +267,13 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
        }


-        final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
+        final Set<VCFHeaderLine> hInfo = new HashSet<>();
        ApplyRecalibration.addVQSRStandardHeaderLines(hInfo);
        recalWriter.writeHeader( new VCFHeader(hInfo) );
+
+        for( int iii = 0; iii < REPLICATE * 2; iii++ ) {
+            replicate.add(GenomeAnalysisEngine.getRandomGenerator().nextDouble());
+        }
    }

    //---------------------------------------------------------------------------------------------------------------
@ -273,8 +282,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    //
    //---------------------------------------------------------------------------------------------------------------

+    @Override
    public ExpandingArrayList<VariantDatum> map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) {
-        final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
+        final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<>();

        if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
            return mapList;
@ -294,7 +304,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat

                    // Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
                    dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC );
-                    double priorFactor = QualityUtils.qualToProb( datum.prior );
+                    final double priorFactor = QualityUtils.qualToProb( datum.prior );
                    datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor );

                    mapList.add( datum );
@ -311,15 +321,18 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    //
    //---------------------------------------------------------------------------------------------------------------

+    @Override
    public ExpandingArrayList<VariantDatum> reduceInit() {
-        return new ExpandingArrayList<VariantDatum>();
+        return new ExpandingArrayList<>();
    }

+    @Override
    public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
        reduceSum.addAll( mapValue );
        return reduceSum;
    }

+    @Override
    public ExpandingArrayList<VariantDatum> treeReduce( final ExpandingArrayList<VariantDatum> lhs, final ExpandingArrayList<VariantDatum> rhs ) {
        rhs.addAll( lhs );
        return rhs;
@ -331,21 +344,23 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
    //
    //---------------------------------------------------------------------------------------------------------------

+    @Override
    public void onTraversalDone( final ExpandingArrayList<VariantDatum> reduceSum ) {
        dataManager.setData( reduceSum );
        dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation

        // Generate the positive model using the training data and evaluate each variant
-        final GaussianMixtureModel goodModel = engine.generateModel( dataManager.getTrainingData(), VRAC.MAX_GAUSSIANS );
+        final List<VariantDatum> positiveTrainingData = dataManager.getTrainingData();
+        final GaussianMixtureModel goodModel = engine.generateModel( positiveTrainingData, VRAC.MAX_GAUSSIANS );
        engine.evaluateData( dataManager.getData(), goodModel, false );

        // Generate the negative model using the worst performing data and evaluate each variant contrastively
-        final ExpandingArrayList<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants( VRAC.NUM_BAD_VARIANTS );
+        final List<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants();
        final GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS));
        engine.evaluateData( dataManager.getData(), badModel, true );

        if( badModel.failedToConverge || goodModel.failedToConverge ) {
-            throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --numBadVariants 3000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") );
+            throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minNumBadVariants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") );
        }

        engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
@ -356,19 +371,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
        final List<Tranche> tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric, VRAC.MODE );
        tranchesStream.print(Tranche.tranchesString( tranches ));

-        // Find the filtering lodCutoff for display on the model PDFs. Red variants are those which were below the cutoff and filtered out of the final callset.
-        double lodCutoff = 0.0;
-        for( final Tranche tranche : tranches ) {
-            if( MathUtils.compareDoubles(tranche.ts, TS_FILTER_LEVEL, 0.0001) == 0 ) {
-                lodCutoff = tranche.minVQSLod;
-            }
-        }
-
        logger.info( "Writing out recalibration table..." );
        dataManager.writeOutRecalibrationTable( recalWriter );
        if( RSCRIPT_FILE != null ) {
            logger.info( "Writing out visualization Rscript file...");
-            createVisualizationScript( dataManager.getRandomDataForPlotting( 6000 ), goodModel, badModel, lodCutoff, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.length]) );
+            createVisualizationScript( dataManager.getRandomDataForPlotting( 1000, positiveTrainingData, negativeTrainingData, dataManager.getEvaluationData() ), goodModel, badModel, 0.0, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.length]) );
        }

        if(VRAC.MODE == VariantRecalibratorArgumentCollection.Mode.INDEL) {
@ -385,7 +392,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
        }
    }

-    private void createVisualizationScript( final ExpandingArrayList<VariantDatum> randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) {
+    private void createVisualizationScript( final List<VariantDatum> randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) {
        PrintStream stream;
        try {
            stream = new PrintStream(RSCRIPT_FILE);
@ -409,7 +416,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
            for( int jjj = iii + 1; jjj < annotationKeys.length; jjj++) {
                logger.info( "Building " + annotationKeys[iii] + " x " + annotationKeys[jjj] + " plot...");

-                final ExpandingArrayList<VariantDatum> fakeData = new ExpandingArrayList<VariantDatum>();
+                final List<VariantDatum> fakeData = new ExpandingArrayList<>();
                double minAnn1 = 100.0, maxAnn1 = -100.0, minAnn2 = 100.0, maxAnn2 = -100.0;
                for( final VariantDatum datum : randomData ) {
                    minAnn1 = Math.min(minAnn1, datum.annotations[iii]);
@ -418,8 +425,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
                    maxAnn2 = Math.max(maxAnn2, datum.annotations[jjj]);
                }
                // Create a fake set of data which spans the full extent of these two annotation dimensions in order to calculate the model PDF projected to 2D
-                for(double ann1 = minAnn1; ann1 <= maxAnn1; ann1+=0.1) {
-                    for(double ann2 = minAnn2; ann2 <= maxAnn2; ann2+=0.1) {
+                final double NUM_STEPS = 60.0;
+                for(double ann1 = minAnn1; ann1 <= maxAnn1; ann1+= (maxAnn1 - minAnn1) / NUM_STEPS) {
+                    for(double ann2 = minAnn2; ann2 <= maxAnn2; ann2+= (maxAnn2 - minAnn2) / NUM_STEPS) {
                        final VariantDatum datum = new VariantDatum();
                        datum.prior = 0.0;
                        datum.annotations = new double[randomData.get(0).annotations.length];
@ -441,7 +449,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat

                stream.print("surface <- c(");
                for( final VariantDatum datum : fakeData ) {
-                    stream.print(String.format("%.3f, %.3f, %.3f, ",
+                    stream.print(String.format("%.4f, %.4f, %.4f, ",
                            dataManager.denormalizeDatum(datum.annotations[iii], iii),
                            dataManager.denormalizeDatum(datum.annotations[jjj], jjj),
                            Math.min(4.0, Math.max(-4.0, datum.lod))));
@ -451,7 +459,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat

                stream.print("data <- c(");
                for( final VariantDatum datum : randomData ) {
-                    stream.print(String.format("%.3f, %.3f, %.3f, %d, %d,",
+                    stream.print(String.format("%.4f, %.4f, %.4f, %d, %d,",
                            dataManager.denormalizeDatum(datum.annotations[iii], iii),
                            dataManager.denormalizeDatum(datum.annotations[jjj], jjj),
                            (datum.lod < lodCutoff ? -1.0 : 1.0),
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
@ -73,27 +73,48 @@ public class VariantRecalibratorArgumentCollection {

    @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels (emitting SNPs untouched in the output VCF); and 3.) BOTH for recalibrating both SNPs and indels simultaneously (for testing purposes only, not recommended for general use).", required = false)
    public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP;
+
+    @Advanced
    @Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians for the positive model to try during variational Bayes algorithm.", required=false)
-    public int MAX_GAUSSIANS = 10;
+    public int MAX_GAUSSIANS = 8;

    @Advanced
    @Argument(fullName="maxNegativeGaussians", shortName="mNG", doc="The maximum number of Gaussians for the negative model to try during variational Bayes algorithm.  The actual maximum used is the min of the mG and mNG arguments.  Note that this number should be small (like 4) to achieve the best results", required=false)
-    public int MAX_GAUSSIANS_FOR_NEGATIVE_MODEL = 4;
+    public int MAX_GAUSSIANS_FOR_NEGATIVE_MODEL = 2;

+    @Advanced
    @Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of VBEM iterations to be performed in variational Bayes algorithm. Procedure will normally end when convergence is detected.", required=false)
-    public int MAX_ITERATIONS = 100;
+    public int MAX_ITERATIONS = 150;
+
+    @Advanced
    @Argument(fullName="numKMeans", shortName="nKM", doc="The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.", required=false)
-    public int NUM_KMEANS_ITERATIONS = 30;
+    public int NUM_KMEANS_ITERATIONS = 100;
+
+    @Advanced
    @Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.", required=false)
-    public double STD_THRESHOLD = 14.0;
-    @Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.", required=false)
-    public double QUAL_THRESHOLD = 80.0;
+    public double STD_THRESHOLD = 10.0;
+
+    @Advanced
    @Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false)
    public double SHRINKAGE = 1.0;
+
+    @Advanced
    @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false)
    public double DIRICHLET_PARAMETER = 0.001;
+
+    @Advanced
    @Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false)
    public double PRIOR_COUNTS = 20.0;
-    @Argument(fullName="numBadVariants", shortName="numBad", doc="The number of worst scoring variants to use when building the Gaussian mixture model of bad variants.", required=false)
-    public int NUM_BAD_VARIANTS = 1000;
+
+    @Advanced
+    @Argument(fullName="maxNumTrainingData", shortName="maxNumTrainingData", doc="Maximum number of training data to be used in building the Gaussian mixture model. Training sets large than this will be randomly downsampled.", required=false)
+    protected int MAX_NUM_TRAINING_DATA = 2500000;
+
+    @Advanced
+    @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum number of worst scoring variants to use when building the Gaussian mixture model of bad variants.", required=false)
+    public int MIN_NUM_BAD_VARIANTS = 1000;
+
+    @Advanced
+    @Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="The LOD score below which to be used when building the Gaussian mixture model of bad variants.", required=false)
+    public double BAD_LOD_CUTOFF = -5.0;
 }
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java
@ -69,7 +69,7 @@ public class VariantRecalibratorEngine {
    // the unified argument collection
    final private VariantRecalibratorArgumentCollection VRAC;

-    private final static double MIN_PROB_CONVERGENCE = 2E-2;
+    private final static double MIN_PROB_CONVERGENCE = 2E-3;

    /////////////////////////////
    // Public Methods to interface with the Engine
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java
@ -0,0 +1,68 @@
+/*
+*  By downloading the PROGRAM you agree to the following terms of use:
+*
+*  BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+*  This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+*  WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+*  WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+*  NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+*  1. DEFINITIONS
+*  1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+*  2. LICENSE
+*  2.1   Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+*  The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only.  For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+*  2.2  No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD.  LICENSEE shall ensure that all of its users agree to the terms of this Agreement.  LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+*  2.3  License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+*  3. OWNERSHIP OF INTELLECTUAL PROPERTY
+*  LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies.  LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+*  Copyright 2012 Broad Institute, Inc.
+*  Notice of attribution:  The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+*  LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+*  4. INDEMNIFICATION
+*  LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+*  5. NO REPRESENTATIONS OR WARRANTIES
+*  THE PROGRAM IS DELIVERED AS IS.  BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+*  IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+*  6. ASSIGNMENT
+*  This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+*  7. MISCELLANEOUS
+*  7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+*  7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+*  7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+*  7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested.  All notices under this Agreement shall be deemed effective upon receipt.
+*  7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+*  7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+*  7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
+
+import junit.framework.Assert;
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.variant.vcf.VCFConstants;
+import org.testng.annotations.Test;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: rpoplin
+ * Date: 11/29/13
+ */
+
+public class ApplyRecalibrationUnitTest extends BaseTest {
+    @Test
+    public final void testGenerateFilterString() {
+        final ApplyRecalibration ar = new ApplyRecalibration();
+        ar.VQSLOD_CUTOFF = 0.0;
+        Assert.assertTrue(ar.generateFilterString(5.0).equals(VCFConstants.PASSES_FILTERS_v4));
+        Assert.assertTrue(ar.generateFilterString(-5.0).equals(ApplyRecalibration.LOW_VQSLOD_FILTER_NAME));
+    }
+}
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java
@ -61,11 +61,85 @@ import java.util.List;
 */

 public class VariantDataManagerUnitTest extends BaseTest {
+
    @Test
    public final void testCalculateSortOrder() {
-        final double[] data = {2.0, 3.0, 0.0, 1.0, 4.0, 68.0, 5.0};
-        VariantDataManager vdm = new VariantDataManager(new ArrayList<String>(), new VariantRecalibratorArgumentCollection());
-        final List<Integer> order = vdm.calculateSortOrder(data);
-        Assert.assertArrayEquals(new int[]{2,3,0,1,4,6,5}, ArrayUtils.toPrimitive(order.toArray(new Integer[order.size()])));
+        final double passingQual = 400.0;
+        final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
+
+        VariantDataManager vdm = new VariantDataManager(new ArrayList<String>(), VRAC);
+
+        final List<VariantDatum> theData = new ArrayList<>();
+        final VariantDatum datum1 = new VariantDatum();
+        datum1.atTrainingSite = true;
+        datum1.failingSTDThreshold = false;
+        datum1.originalQual = passingQual;
+        datum1.annotations = new double[]{0.0,-10.0,10.0};
+        datum1.isNull = new boolean[]{false, false, false};
+        theData.add(datum1);
+
+        final VariantDatum datum2 = new VariantDatum();
+        datum2.atTrainingSite = true;
+        datum2.failingSTDThreshold = false;
+        datum2.originalQual = passingQual;
+        datum2.annotations = new double[]{0.0,-9.0,15.0};
+        datum2.isNull = new boolean[]{false, false, false};
+        theData.add(datum2);
+
+        final VariantDatum datum3 = new VariantDatum();
+        datum3.atTrainingSite = false;
+        datum3.failingSTDThreshold = false;
+        datum3.originalQual = passingQual;
+        datum3.annotations = new double[]{0.0,1.0,999.0};
+        datum3.isNull = new boolean[]{false, false, false};
+        theData.add(datum3);
+
+        final VariantDatum datum4 = new VariantDatum();
+        datum4.atTrainingSite = false;
+        datum4.failingSTDThreshold = false;
+        datum4.originalQual = passingQual;
+        datum4.annotations = new double[]{0.015,2.0,1001.11};
+        datum4.isNull = new boolean[]{false, false, false};
+        theData.add(datum4);
+
+        vdm.setData(theData);
+
+        final double[] meanVector = new double[3];
+        for( int iii = 0; iii < meanVector.length; iii++ ) {
+            meanVector[iii] = vdm.mean(iii, true);
+        }
+        final List<Integer> order = vdm.calculateSortOrder(meanVector);
+        Assert.assertArrayEquals(new int[]{2,1,0}, ArrayUtils.toPrimitive(order.toArray(new Integer[order.size()])));
+    }
+
+    @Test
+    public final void testDownSamplingTrainingData() {
+        final int MAX_NUM_TRAINING_DATA = 5000;
+        final double passingQual = 400.0;
+        final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
+        VRAC.MAX_NUM_TRAINING_DATA = MAX_NUM_TRAINING_DATA;
+
+        VariantDataManager vdm = new VariantDataManager(new ArrayList<String>(), VRAC);
+        final List<VariantDatum> theData = new ArrayList<>();
+        for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 10; iii++) {
+            final VariantDatum datum = new VariantDatum();
+            datum.atTrainingSite = true;
+            datum.failingSTDThreshold = false;
+            datum.originalQual = passingQual;
+            theData.add(datum);
+        }
+
+        for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 2; iii++) {
+            final VariantDatum datum = new VariantDatum();
+            datum.atTrainingSite = false;
+            datum.failingSTDThreshold = false;
+            datum.originalQual = passingQual;
+            theData.add(datum);
+        }
+
+        vdm.setData(theData);
+        final List<VariantDatum> trainingData = vdm.getTrainingData();
+
+        Assert.assertTrue( trainingData.size() == MAX_NUM_TRAINING_DATA );
    }
 }
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
@ -79,9 +79,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
    }

    VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
-            "0f4ceeeb8e4a3c89f8591d5e531d8410",  // tranches
-            "c979a102669498ef40dde47ca4133c42",  // recal file
-            "8f60fd849537610b653b321869e94641"); // cut VCF
+            "6f029dc7d16e63e19c006613cd0a5cff",  // tranches
+            "73c7897441622c9b37376eb4f071c560",  // recal file
+            "11a28df79b92229bd317ac49a3ed0fa1"); // cut VCF

    @DataProvider(name = "VRTest")
    public Object[][] createData1() {
@ -126,9 +126,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
    }

    VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
-            "6539e025997579cd0c7da12219cbc572",  // tranches
-            "778e61f81ab3d468b75f684bef0478e5",  // recal file
-            "21e96b0bb47e2976f53f11181f920e51"); // cut VCF
+            "3ad7f55fb3b072f373cbce0b32b66df4",  // tranches
+            "e747c08131d58d9a4800720f6ca80e0c",  // recal file
+            "e5808af3af0f2611ba5a3d172ab2557b"); // cut VCF

    @DataProvider(name = "VRBCFTest")
    public Object[][] createVRBCFTest() {
@ -178,15 +178,15 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {

    VRTest indelUnfiltered = new VRTest(
            validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as .
-            "8906fdae8beca712f5ff2808d35ef02d",  // tranches
-            "07ffea25e04f6ef53079bccb30bd6a7b",  // recal file
-            "8b3ef71cad71e8eb48a856a27ae4f8d5"); // cut VCF
+            "9a331328370889168a7aa3a625f73620",  // tranches
+            "2cbbd146d68c40200b782e0226f71976",  // recal file
+            "64dd98a5ab80cf5fd9a36eb66b38268e"); // cut VCF

    VRTest indelFiltered = new VRTest(
            validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS
-            "8906fdae8beca712f5ff2808d35ef02d",  // tranches
-            "07ffea25e04f6ef53079bccb30bd6a7b",  // recal file
-            "3d69b280370cdd9611695e4893591306"); // cut VCF
+            "9a331328370889168a7aa3a625f73620",  // tranches
+            "2cbbd146d68c40200b782e0226f71976",  // recal file
+            "c0ec662001e829f5779a9d13b1d77d80"); // cut VCF

    @DataProvider(name = "VRIndelTest")
    public Object[][] createTestVariantRecalibratorIndel() {
@ -242,7 +242,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
                        " -o %s" +
                        " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" +
                        " -recalFile " + privateTestDir + "VQSR.mixedTest.recal",
-                Arrays.asList("20c23643a78c5b95abd1526fdab8960d"));
+                Arrays.asList("03a0ed00af6aac76d39e569f90594a02"));
        executeTest("testApplyRecalibrationSnpAndIndelTogether", spec);
    }