Some cleanup for the variant recalibrator. Now uses @Input and @Output so that it can join the Queue party. Users now specify a -o, -clusterFile, -tranchesFile, and -reportDatFile. Example on the wiki. ApplyVariantCuts now has an integration test. The base quality recalibrator now requires a dbSNP rod or a VCF file. Now that the base quality recalibrator is using @Output, the PrintStream shouldn't be closed in onTraversalDone.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4101 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-08-24 20:14:58 +00:00
parent f2b138d975
commit 85007ffa87
7 changed files with 178 additions and 174 deletions

View File

@ -104,6 +104,8 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
/////////////////////////////
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
private boolean DONT_SORT_OUTPUT = false;
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
private boolean RUN_WITHOUT_DBSNP = false;
/////////////////////////////
// Private Member Variables
@ -183,8 +185,8 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
break;
}
}
if( !foundDBSNP ) {
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
if( !foundDBSNP && !RUN_WITHOUT_DBSNP ) {
throw new StingException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation.");
}
// Initialize the requested covariates by parsing the -cov argument
@ -347,7 +349,7 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
}
return counter; // This value isn't actually used anywhere
return counter;
}
@ -468,8 +470,6 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
logger.info( "Writing raw recalibration data..." );
outputToCSV( sum, RECAL_FILE );
logger.info( "...done!" );
RECAL_FILE.close();
}
/**

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -53,21 +55,28 @@ import java.util.*;
public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
/////////////////////////////
// Inputs
/////////////////////////////
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
private File TRANCHES_FILE;
/////////////////////////////
// Outputs
/////////////////////////////
@Output( doc="The output filtered VCF file", required=true)
private VCFWriter vcfWriter = null;
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=true)
private String TRANCHE_FILENAME = "optimizer.dat.tranches";
@Argument(fullName="outputVCFFile", shortName="outputVCF", doc="The output filtered VCF file", required=true)
private String OUTPUT_FILENAME = "optimizer.vcf";
@Argument(fullName="fdr_filter_level", shortName="fdr_filter_level", doc="The FDR level at which to start filtering.", required=false)
private double FDR_FILTER_LEVEL = 0.0;
/////////////////////////////
// Private Member Variables
/////////////////////////////
private VCFWriter vcfWriter;
final ExpandingArrayList<Double> qCuts = new ExpandingArrayList<Double>();
final ExpandingArrayList<String> filterName = new ExpandingArrayList<String>();
@ -128,7 +137,7 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
// todo -- I would have updated your code but there's no integration test to protect me from unexpected effects
boolean firstLine = true;
try {
for( final String line : new XReadLines(new File( TRANCHE_FILENAME )) ) {
for( final String line : new XReadLines( TRANCHES_FILE ) ) {
if( !firstLine ) {
final String[] vals = line.split(",");
double FDR = Double.parseDouble(vals[0]);
@ -147,7 +156,7 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
firstLine = false;
}
} catch( FileNotFoundException e ) {
throw new StingException("Can not find input file: " + TRANCHE_FILENAME);
throw new StingException("Can not find input file: " + TRANCHES_FILE);
}
// setup the header fields
@ -155,7 +164,6 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFHeaderLineType.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
vcfWriter = new StandardVCFWriter( new File(OUTPUT_FILENAME) );
final TreeSet<String> samples = new TreeSet<String>();
samples.addAll(SampleUtils.getSampleListWithVCFHeader(getToolkit(), null));
@ -195,8 +203,9 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
break;
}
}
if( filterString == null )
if( filterString == null ) {
filterString = filterName.get(0)+"+";
}
if ( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
Set<String> filters = new HashSet<String>();
@ -227,7 +236,6 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
}
public void onTraversalDone( Integer reduceSum ) {
vcfWriter.close();
}
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
@ -41,7 +42,9 @@ import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.commandline.Argument;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
/**
@ -55,6 +58,13 @@ import java.util.*;
public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
/////////////////////////////
// Outputs
/////////////////////////////
@Output(fullName="cluster_file", shortName="clusterFile", doc="The output cluster file", required=true)
private PrintStream CLUSTER_FILE;
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@ -64,17 +74,15 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
private String[] USE_ANNOTATIONS = null;
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
private int MAX_GAUSSIANS = 4;
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
private int MAX_ITERATIONS = 200;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
@Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
@Argument(fullName = "path_to_Rscript", shortName="Rscript", doc="The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
// private String PATH_TO_RSCRIPT = "Rscript";
// @Argument(fullName = "path_to_resources", shortName = "resources", doc="Path to resources folder holding the Sting R scripts.", required=false)
// private String PATH_TO_RESOURCES = "R/";
// @Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
private double WEIGHT_NOVELS = 0.0;
@Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for MQ2+ known variants during clustering", required=false)
private double WEIGHT_KNOWNS = 0.0;
@ -89,13 +97,12 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
private double STD_THRESHOLD = 4.5;
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
private double QUAL_THRESHOLD = 1000.0;
private double QUAL_THRESHOLD = 800.0;
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
private double SHRINKAGE = 0.0001;
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)
private double DIRICHLET_PARAMETER = 1000.0;
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
//private int NUM_KNN = 2000;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
@ -115,7 +122,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
// if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
@ -222,8 +229,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
VariantGaussianMixtureModel theModel;
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( dataManager, MAX_GAUSSIANS, MAX_ITERATIONS, maxAC, FORCE_INDEPENDENT,
STD_THRESHOLD, QUAL_THRESHOLD, SHRINKAGE, DIRICHLET_PARAMETER );
theModel = new VariantGaussianMixtureModel( dataManager, MAX_GAUSSIANS, MAX_ITERATIONS, FORCE_INDEPENDENT,
STD_THRESHOLD, SHRINKAGE, DIRICHLET_PARAMETER );
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
@ -232,7 +239,9 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
}
theModel.run( CLUSTER_FILENAME );
theModel.run( CLUSTER_FILE );
/*
theModel.outputClusterReports( CLUSTER_FILENAME );
for( final String annotation : annotationKeys ) {
@ -248,6 +257,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
Utils.warnUser("Unable to execute the RScript command because of [" + e.getMessage() + "]. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the clustering proceded as expected. We highly recommend trying to rerun the script manually if possible.");
}
}
*/
}
}

View File

@ -73,9 +73,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final Matrix[] sigmaInverse;
private double[] pClusterLog10;
private final double[] determinant;
private final double[] alleleCountFactorArray;
private final double stdThreshold;
private final double qualThreshold;
private double[] empiricalMu;
private Matrix empiricalSigma;
@ -85,12 +83,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final double[] hyperParameter_lambda;
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _maxGaussians, final int _maxIterations,
final int maxAC, final boolean _forceIndependent, final double _stdThreshold, final double _qualThreshold,
final double _shrinkage, final double _dirichlet) {
final boolean _forceIndependent, final double _stdThreshold, final double _shrinkage, final double _dirichlet) {
dataManager = _dataManager;
maxGaussians = _maxGaussians;
maxIterations = _maxIterations;
@ -99,9 +95,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
sigma = new Matrix[maxGaussians];
determinant = new double[maxGaussians];
pClusterLog10 = new double[maxGaussians];
alleleCountFactorArray = new double[maxAC + 1];
stdThreshold = _stdThreshold;
qualThreshold = _qualThreshold;
FORCE_INDEPENDENT_ANNOTATIONS = _forceIndependent;
hyperParameter_a = new double[maxGaussians];
hyperParameter_b = new double[maxGaussians];
@ -112,26 +106,23 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
DIRICHLET_PARAMETER = _dirichlet;
}
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
public VariantGaussianMixtureModel( final double _targetTITV, final File clusterFile, final double backOffGaussianFactor ) {
super( _targetTITV );
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
final ExpandingArrayList<String> alleleCountLines = new ExpandingArrayList<String>();
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
try {
for ( final String line : new XReadLines(new File( clusterFileName )) ) {
for ( final String line : new XReadLines( clusterFile ) ) {
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
annotationLines.add(line);
} else if( ALLELECOUNT_PATTERN.matcher(line).matches() ) {
alleleCountLines.add(line);
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
clusterLines.add(line);
} else {
throw new StingException("Malformed input file: " + clusterFileName);
throw new StingException("Malformed input file: " + clusterFile);
}
}
} catch ( FileNotFoundException e ) {
throw new StingException("Can not find input file: " + clusterFileName);
throw new StingException("Can not find input file: " + clusterFile);
}
dataManager = new VariantDataManager( annotationLines );
@ -140,7 +131,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
DIRICHLET_PARAMETER = 0;
maxIterations = 0;
stdThreshold = 0.0;
qualThreshold = 0.0;
FORCE_INDEPENDENT_ANNOTATIONS = false;
hyperParameter_a = null;
hyperParameter_b = null;
@ -155,14 +145,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
pClusterLog10 = new double[maxGaussians];
determinant = new double[maxGaussians];
alleleCountFactorArray = new double[alleleCountLines.size() + 1];
for( final String line : alleleCountLines ) {
final String[] vals = line.split(",");
int i = Integer.parseInt(vals[1]);
alleleCountFactorArray[i] = Double.parseDouble(vals[2]);
double oldACPrior = alleleCountFactorArray[i];
}
int kkk = 0;
for( final String line : clusterLines ) {
final String[] vals = line.split(",");
@ -183,10 +165,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
logger.info("Found " + maxGaussians + " clusters using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
}
public final void run( final String clusterFileName ) {
// Initialize the Allele Count prior
generateAlleleCountPrior();
public final void run( final PrintStream clusterFile ) {
// int numValid = 0;
// int numOutlier = 0;
@ -215,7 +194,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
logger.info("Initializing using k-means...");
initializeUsingKMeans( dataManager.data );
logger.info("... done!");
createClusters( dataManager.data, 0, maxGaussians, clusterFileName );
createClusters( dataManager.data, 0, maxGaussians, clusterFile );
// Simply cluster with all the variants. The knowns have been given more weight than the novels
//logger.info("Clustering with " + dataManager.data.length + " variants.");
@ -260,43 +239,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
//empiricalSigma.timesEquals(1.0 / (Math.pow(maxGaussians, 2.0 / ((double) numAnnotations))));
}
private void generateAlleleCountPrior() {
final double[] acExpectation = new double[alleleCountFactorArray.length];
final double[] acActual = new double[alleleCountFactorArray.length];
final int[] alleleCount = new int[alleleCountFactorArray.length];
double sumExpectation = 0.0;
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
acExpectation[iii] = 1.0 / ((double) iii);
sumExpectation += acExpectation[iii];
}
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
acExpectation[iii] /= sumExpectation; // Turn acExpectation into a probability distribution
alleleCount[iii] = 1; // Start off with one count to smooth the estimate
}
for( final VariantDatum datum : dataManager.data ) {
alleleCount[datum.alleleCount]++;
}
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
acActual[iii] = ((double)alleleCount[iii]) / ((double) (dataManager.data.length+(alleleCountFactorArray.length-1))); // Turn acActual into a probability distribution
}
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
alleleCountFactorArray[iii] = acExpectation[iii] / acActual[iii]; // Prior is (expected / observed)
}
}
public void setSingletonFPRate( final double rate ) {
this.singletonFPRate = rate;
}
public final double getAlleleCountPrior( final int alleleCount ) {
if ( this.singletonFPRate == -1 )
return alleleCountFactorArray[alleleCount];
else {
return Math.min(0.95, 1.0 - Math.pow(singletonFPRate, alleleCount)); //TODO -- define the vals
}
return Math.min(0.95, 1.0 - Math.pow(singletonFPRate, alleleCount)); //TODO -- define the vals
}
private void initializeUsingKMeans( final VariantDatum[] data ) {
@ -365,7 +313,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName ) {
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final PrintStream clusterFile ) {
final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length;
@ -439,38 +387,28 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Output the final cluster parameters
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
printClusterParameters( clusterFileName );
printClusterParameters( clusterFile );
}
private void printClusterParameters( final String clusterFileName ) {
try {
final PrintStream outputFile = new PrintStream( clusterFileName );
dataManager.printClusterFileHeader( outputFile );
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
outputFile.print("@!ALLELECOUNT,");
outputFile.println(iii + "," + alleleCountFactorArray[iii]);
}
private void printClusterParameters( final PrintStream clusterFile ) {
dataManager.printClusterFileHeader( clusterFile );
final int numAnnotations = mu[0].length;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
if( Math.pow(10.0, pClusterLog10[kkk]) > 1E-4 ) { // BUGBUG: make this a command line argument
final double sigmaVals[][] = sigma[kkk].getArray();
outputFile.print("@!CLUSTER");
outputFile.print("," + Math.pow(10.0, pClusterLog10[kkk]));
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
outputFile.print("," + mu[kkk][jjj]);
}
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
outputFile.print("," + (sigmaVals[jjj][ppp] / hyperParameter_a[kkk]) );
}
}
outputFile.println();
final int numAnnotations = mu[0].length;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
if( Math.pow(10.0, pClusterLog10[kkk]) > 1E-4 ) { // BUGBUG: make this a command line argument
final double sigmaVals[][] = sigma[kkk].getArray();
clusterFile.print("@!CLUSTER");
clusterFile.print("," + Math.pow(10.0, pClusterLog10[kkk]));
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
clusterFile.print("," + mu[kkk][jjj]);
}
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
clusterFile.print("," + (sigmaVals[jjj][ppp] / hyperParameter_a[kkk]) );
}
}
clusterFile.println();
}
outputFile.close();
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + clusterFileName );
}
}
@ -563,13 +501,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
annIndex++;
outputFile.close();
}
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
}
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix,
public final void outputOptimizationCurve( final VariantDatum[] data, final PrintStream outputReportDatFile, final PrintStream outputTranchesFile,
final int desiredNumVariants, final Double[] FDRtranches, final double QUAL_STEP ) {
final int numVariants = data.length;
@ -595,16 +530,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
PrintStream outputFile;
try {
outputFile = new PrintStream( outputPrefix + ".dat" );
outputFile = new PrintStream( outputReportDatFile );
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
throw new StingException( "Unable to create output file: " + outputReportDatFile );
}
PrintStream tranchesOutputFile;
try {
tranchesOutputFile = new PrintStream( outputPrefix + ".dat.tranches" );
tranchesOutputFile = new PrintStream( outputTranchesFile );
tranchesOutputFile.println("FDRtranche,novelTITV,pCut,numNovel,filterName");
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat.tranches" );
throw new StingException( "Unable to create output file: " + outputTranchesFile );
}
int numKnown = 0;
@ -706,12 +641,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5))))));
}
}
outputFile.close();
tranchesOutputFile.close();
}
private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
final int numAnnotations = data[0].annotations.length;

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import java.io.PrintStream;
/*
* Copyright (c) 2010 The Broad Institute
*
@ -32,5 +34,5 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
*/
public interface VariantOptimizationInterface {
public void run( String outputPrefix );
public void run( PrintStream outputPrefix );
}

View File

@ -28,6 +28,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
@ -42,7 +44,9 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
/**
@ -56,12 +60,29 @@ import java.util.*;
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
/////////////////////////////
// Inputs
/////////////////////////////
@Input(fullName="cluster_file", shortName="clusterFile", doc="The input cluster file generated by GenerateVariantClusters", required=true)
private File CLUSTER_FILE;
/////////////////////////////
// Outputs
/////////////////////////////
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyVariantCuts", required=true)
private PrintStream TRANCHES_FILE;
@Output(fullName="report_dat_file", shortName="reportDatFile", doc="The output report .dat file used with Rscript to create the optimization curve PDF file", required=true)
private File REPORT_DAT_FILE;
@Output(doc="File to which recalibrated variants should be written",required=true)
private VCFWriter vcfWriter = null;
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The expected Ti/Tv ratio to display on optimization curve output figures. (~~2.1 for whole genome experiments)", required=false)
private double TARGET_TITV = 2.1;
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=true)
private double TARGET_TITV = 2.07;
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by enlarging out the Gaussians.", required=false)
private double BACKOFF_FACTOR = 1.3;
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
private int DESIRED_NUM_VARIANTS = 0;
@ -75,14 +96,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private double KNOWN_QUAL_PRIOR = 10.0;
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private double NOVEL_QUAL_PRIOR = 2.0;
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private Double[] FDR_TRANCHES = null;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
@ -97,8 +112,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
/////////////////////////////
// Private Member Variables
/////////////////////////////
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
private VariantGaussianMixtureModel theModel = null;
private VCFWriter vcfWriter;
private Set<String> ignoreInputFilterSet = null;
//---------------------------------------------------------------------------------------------------------------
@ -116,9 +131,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILENAME, BACKOFF_FACTOR );
if ( SINGLETON_FP_RATE != -1 )
theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILE, BACKOFF_FACTOR );
if ( SINGLETON_FP_RATE != -1 ) {
theModel.setSingletonFPRate(SINGLETON_FP_RATE);
}
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
@ -136,7 +152,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit()));
vcfWriter = new StandardVCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
@ -154,13 +169,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Set up default values for the FDR tranches if necessary
if( FDR_TRANCHES == null ) {
FDR_TRANCHES = new Double[6];
FDR_TRANCHES = new Double[5];
FDR_TRANCHES[0] = 0.1;
FDR_TRANCHES[1] = 1.0;
FDR_TRANCHES[2] = 5.0;
FDR_TRANCHES[3] = 12.5;
FDR_TRANCHES[4] = 25.0;
FDR_TRANCHES[5] = 35.0;
FDR_TRANCHES[3] = 10.0;
FDR_TRANCHES[4] = 15.0;
}
}
@ -240,25 +254,31 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
vcfWriter.close();
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS, FDR_TRANCHES, QUAL_STEP );
try {
PrintStream reportDatFilePrintStream = new PrintStream(REPORT_DAT_FILE);
theModel.outputOptimizationCurve( dataManager.data, reportDatFilePrintStream, TRANCHES_FILE, DESIRED_NUM_VARIANTS, FDR_TRANCHES, QUAL_STEP );
} catch ( FileNotFoundException e ) {
throw new StingException("Can't create report dat file: " + REPORT_DAT_FILE.getName());
}
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptOptimizationCurveCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
final String rScriptOptimizationCurveCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV;
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV;
logger.info( rScriptOptimizationCurveCommandLine );
logger.info( rScriptTranchesCommandLine );
// Execute the RScript command to plot the table of truth values
try {
Runtime.getRuntime().exec( rScriptOptimizationCurveCommandLine );
Runtime.getRuntime().exec( rScriptTranchesCommandLine );
} catch ( IOException e ) {
Process p;
p = Runtime.getRuntime().exec( rScriptOptimizationCurveCommandLine );
p.waitFor();
p = Runtime.getRuntime().exec( rScriptTranchesCommandLine );
p.waitFor();
} catch ( Exception e ) {
Utils.warnUser("Unable to execute the RScript command. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the recalibration proceded as expected. We highly recommend trying to rerun the script manually if possible.");
}
}

View File

@ -7,12 +7,14 @@ import java.util.*;
import java.io.File;
public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
static HashMap<String, String> paramsFiles = new HashMap<String, String>();
static HashMap<String, String> clusterFiles = new HashMap<String, String>();
static HashMap<String, String> tranchesFiles = new HashMap<String, String>();
static HashMap<String, String> inputVCFFiles = new HashMap<String, String>();
@Test
public void testGenerateVariantClusters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "005898194773d2f2be45a68f6e4c4c25" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e6724946c3298b3d74bb1ba1396a9190" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -20,7 +22,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -T GenerateVariantClusters" +
" -B input,VCF," + vcf +
" -L 1:1-100,000,000" +
@ -30,37 +32,69 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
1, // just one output file
Arrays.asList(md5));
List<File> result = executeTest("testGenerateVariantClusters", spec).getFirst();
paramsFiles.put(vcf, result.get(0).getAbsolutePath());
clusterFiles.put(vcf, result.get(0).getAbsolutePath());
}
}
@Test
public void testVariantRecalibrator() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "acb89428115564cbf35ab7c006252108" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b97ab64b86ce8c8698855058d32853ce" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
String md5 = entry.getValue();
String paramsFile = paramsFiles.get(vcf);
System.out.printf("PARAMS FOR %s is %s%n", vcf, paramsFile);
if ( paramsFile != null ) {
File file = createTempFile("cluster",".vcf");
WalkerTestSpec spec = new WalkerTestSpec(
String clusterFile = clusterFiles.get(vcf);
System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
if ( clusterFile != null ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -T VariantRecalibrator" +
" -B input,VCF," + vcf +
" -L 1:40,000,000-100,000,000" +
" --ignore_filter GATK_STANDARD" +
" -output " + file.getAbsolutePath().substring(0,file.getAbsolutePath().length()-4) +
" -clusterFile " + paramsFile,
0,
new ArrayList<String>(0));
" --ignore_filter HARD_TO_VALIDATE" +
" -clusterFile " + clusterFile +
" -titv 2.07" +
" -o %s" +
" -tranchesFile %s" +
" -reportDatFile %s",
3, // three output files: the recalibrated VCF (-o), the tranches file, and the report dat file
Arrays.asList(md5, "f603aa2052ae6e81a6bde63a8a3f9539","951a17f9c11de391763b9a8cb239205a"));
List<File> result = executeTest("testVariantRecalibrator", spec).getFirst();
inputVCFFiles.put(vcf, result.get(0).getAbsolutePath());
tranchesFiles.put(vcf, result.get(1).getAbsolutePath());
}
}
spec.addAuxFile(md5, file);
executeTest("testVariantRecalibrator", spec);
}
@Test
public void testApplyVariantCuts() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e80f26d68be2b183fd7f062039cef28a" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
String md5 = entry.getValue();
String inputVCFFile = inputVCFFiles.get(vcf);
String tranchesFile = tranchesFiles.get(vcf);
System.out.printf("PARAMS FOR %s is %s%n", vcf, inputVCFFile);
System.out.printf("PARAMS FOR %s is %s%n", vcf, tranchesFile);
if ( inputVCFFile != null && tranchesFile != null ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -T ApplyVariantCuts" +
" -L 1:40,000,000-100,000,000" +
" -B input,VCF," + inputVCFFile +
" -o %s" +
" -tranchesFile " + tranchesFile,
1, // just one output file
Arrays.asList(md5));
List<File> result = executeTest("testApplyVariantCuts", spec).getFirst();
}
}
}