Adding a skeleton for the second step of the variant optimization process.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3023 348d0f76-0448-11de-a6fe-93d51630548a
2010-03-17 17:03:40 +00:00 · 2010-03-17 17:03:40 +00:00 · 1bb4394aa9
parent ded4ba8966
commit 1bb4394aa9
8 changed files with 274 additions and 61 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java
@ -0,0 +1,125 @@
+package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.RodWalker;
+import org.broadinstitute.sting.utils.ExpandingArrayList;
+import org.broadinstitute.sting.utils.StingException;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
+ *
+ * @author rpoplin
+ * @since Mar 17, 2010
+ *
+ * @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
+ */
+
+public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
+
+    /////////////////////////////
+    // Command Line Arguments
+    /////////////////////////////
+    //@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=false)
+    //private double TARGET_TITV = 2.1;
+    @Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
+    private int DESIRED_NUM_VARIANTS = 0;
+    @Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
+    private boolean IGNORE_INPUT_FILTERS = false;
+    @Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
+    private String OUTPUT_PREFIX = "optimizer";
+    @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
+    private String CLUSTER_FILENAME = "optimizer.cluster";
+    //@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
+    private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
+    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
+    private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
+    @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
+    private String PATH_TO_RESOURCES = "R/";
+
+    /////////////////////////////
+    // Private Member Variables
+    /////////////////////////////
+    private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>();
+
+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // initialize
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+    public void initialize() {
+        if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
+
+        VariantOptimizationModel theModel = null;
+        switch (OPTIMIZATION_MODEL) {
+            case GAUSSIAN_MIXTURE_MODEL:
+                theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
+                break;
+            //case K_NEAREST_NEIGHBORS:
+            //    theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
+            //    break;
+            default:
+                throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
+        }
+    }
+
+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // map
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+    public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
+        final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
+        return mapList;
+    }
+
+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // reduce
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+     public ExpandingArrayList<VariantDatum> reduceInit() {
+        return new ExpandingArrayList<VariantDatum>();
+    }
+
+    public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
+        reduceSum.addAll( mapValue );
+        return reduceSum;
+    }
+
+    public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
+
+        final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
+    }
+}
+
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java
@ -33,5 +33,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;

 public interface VariantClusteringModel extends VariantOptimizationInterface {
    public void createClusters( final VariantDatum[] data );
-    public void applyClusters( final VariantDatum[] data, final String outputPrefix );
+    //public void applyClusters( final VariantDatum[] data, final String outputPrefix );
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java
@ -4,6 +4,8 @@ import org.broadinstitute.sting.utils.ExpandingArrayList;
 import org.broadinstitute.sting.utils.StingException;

 import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.regex.Pattern;

 /*
 * Copyright (c) 2010 The Broad Institute
@ -40,8 +42,8 @@ public class VariantDataManager {
    public final VariantDatum[] data;
    public final int numVariants;
    public final int numAnnotations;
-    public final double[] varianceVector;
    public final double[] meanVector;
+    public final double[] varianceVector;
    public boolean isNormalized;
    private final ExpandingArrayList<String> annotationKeys;

@ -61,10 +63,26 @@ public class VariantDataManager {
        annotationKeys = _annotationKeys;
    }

-    public void normalizeData() {
-        for( int iii = 0; iii < numVariants; iii++ ) {
-            data[iii].originalAnnotations = data[iii].annotations.clone();
+    public VariantDataManager( final ExpandingArrayList<String> annotationLines ) {
+        data = null;
+        numVariants = 0;
+        numAnnotations = annotationLines.size();
+        meanVector = new double[numAnnotations];
+        varianceVector = new double[numAnnotations];
+        isNormalized = true;
+        annotationKeys = new ExpandingArrayList<String>();
+
+        int jjj = 0;
+        for( String line : annotationLines ) {
+            String[] vals = line.split(",");
+            annotationKeys.add(vals[1]);
+            meanVector[jjj] = Double.parseDouble(vals[2]);
+            varianceVector[jjj] = Double.parseDouble(vals[3]);
+            jjj++;
        }
+    }
+
+    public void normalizeData() {
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            final double theMean = mean(data, jjj);
            final double theSTD = standardDeviation(data, theMean, jjj);
@ -99,10 +117,9 @@ public class VariantDataManager {
        return Math.sqrt( sum );
    }

-//    public void printClusterFileHeader( PrintStream outputFile ) {
-//        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
-//            outputFile.println("@ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
-//        }
-//    }
-
+    public void printClusterFileHeader( PrintStream outputFile ) {
+        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
+            outputFile.println("@!ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
+        }
+    }
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java
@ -33,7 +33,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;

 public class VariantDatum {
    public double[] annotations;
-    public double[] originalAnnotations;
    public boolean isTransition;
    public boolean isKnown;
+    public double pTrue;
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java
@ -1,7 +1,15 @@
 package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;

+import org.broadinstitute.sting.utils.ExpandingArrayList;
+import org.broadinstitute.sting.utils.StingException;
+import org.broadinstitute.sting.utils.xReadLines;
+
+import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.PrintStream;
+import java.util.ArrayList;
 import java.util.Random;
+import java.util.regex.Pattern;

 /*
 * Copyright (c) 2010 The Broad Institute
@ -36,6 +44,7 @@ import java.util.Random;

 public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {

+    private final VariantDataManager dataManager;
    private final int numGaussians;
    private final int numIterations;
    private final long RANDOM_SEED = 91801305;
@ -50,9 +59,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
    private final int[] numMaxClusterNovel;
    private final double[] clusterTITV;
    private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
+    //private final int desiredNumVariants;
+    private final int minVarInCluster;

-    public VariantGaussianMixtureModel( VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations ) {
-       super( _dataManager, _targetTITV );
+    private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
+    private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
+
+    public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) {
+        super( _targetTITV );
+        dataManager = _dataManager;
        numGaussians = _numGaussians;
        numIterations = _numIterations;

@ -63,9 +78,46 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
        numMaxClusterNovel = new int[numGaussians];
        clusterTITV = new double[numGaussians];
        clusterTruePositiveRate = new double[numGaussians];
+        //desiredNumVariants = _desiredNumVariants;
+        minVarInCluster = _minVarInCluster;
+    }
+
+    public VariantGaussianMixtureModel( final String clusterFileName ) {
+        super( 0.0 );
+        ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
+        ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
+
+        try {
+            for ( String line : new xReadLines(new File( clusterFileName )) ) {
+                if( ANNOTATION_PATTERN.matcher(line).matches() ) {
+                    annotationLines.add(line);
+                } else if( CLUSTER_PATTERN.matcher(line).matches() ) {
+                    clusterLines.add(line);
+                } else {
+                    throw new StingException("Malformed input file: " + clusterFileName);
+                }
+            }
+        } catch ( FileNotFoundException e ) {
+            throw new StingException("Can not find input file: " + clusterFileName);
+        }
+
+        dataManager = new VariantDataManager( annotationLines );
+        numIterations = 0;
+        pCluster = null;
+        numMaxClusterKnown = null;
+        numMaxClusterNovel = null;
+        clusterTITV = null;
+        clusterTruePositiveRate = null;
+        //desiredNumVariants = _desiredNumVariants;
+        minVarInCluster = 0;
+
+        numGaussians = clusterLines.size();
+        mu = new double[numGaussians][];
+        sigma = new double[numGaussians][];
+
    }
    
-    public final void run( final String outputPrefix ) {
+    public final void run( final String clusterFileName ) {

        // Create the subset of the data to cluster with
        int numNovel = 0;
@ -78,6 +130,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel

        // Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random
        // If there are almost as many novels as known, simply use all the variants
+        // BUGBUG: allow downsampling and arbitrary mixtures of knowns and novels
        final int numSubset = (int)Math.floor(numNovel*2.5);
        if( numSubset * 1.3 < dataManager.numVariants ) {
            data = new VariantDatum[numSubset];
@ -100,10 +153,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
        System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants...");
        if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); }
        createClusters( data ); // Using a subset of the data
-        System.out.println("Printing out cluster parameters...");
-        printClusters( outputPrefix );
-        System.out.println("Applying clusters to all variants...");
-        applyClusters( dataManager.data, outputPrefix ); // Using all the data
+        System.out.println("Outputting cluster parameters...");
+        printClusters( clusterFileName );
+        //System.out.println("Applying clusters to all variants...");
+        //applyClusters( dataManager.data, outputPrefix ); // Using all the data
    }

    public final void createClusters( final VariantDatum[] data ) {
@ -198,15 +251,14 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
        }
    }

-    private void printClusters( final String outputPrefix ) {
+    private void printClusters( final String clusterFileName ) {
        try {
-            final PrintStream outputFile = new PrintStream( outputPrefix + ".clusters" );
-            int clusterNumber = 0;
+            final PrintStream outputFile = new PrintStream( clusterFileName );
+            dataManager.printClusterFileHeader( outputFile );
            final int numAnnotations = mu[0].length;
            for( int kkk = 0; kkk < numGaussians; kkk++ ) {
-                //BUGBUG: only output the good clusters here to protect against over-fitting?
-                //if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= 2000 ) {
-                    outputFile.print(clusterNumber + ",");
+                if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= minVarInCluster ) {
+                    outputFile.print("@!CLUSTER,");
                    outputFile.print(numMaxClusterKnown[kkk] + ",");
                    outputFile.print(numMaxClusterNovel[kkk] + ",");
                    outputFile.print(clusterTITV[kkk] + ",");
@ -218,8 +270,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
                        outputFile.print(sigma[kkk][jjj] + ",");
                    }
                    outputFile.println(-1);
-                    clusterNumber++;
-                //}
+                }
            }
        } catch (Exception e) {
            e.printStackTrace();
@ -227,6 +278,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
        }
    }

+    /*
    public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) {

        final int numVariants = data.length;
@ -262,6 +314,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
        int numKnownTv = 0;
        int numNovelTi = 0;
        int numNovelTv = 0;
+        boolean foundDesiredNumVariants = false;
        outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
        for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) {
            for( int iii = 0; iii < numVariants; iii++ ) {
@ -286,9 +339,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
                    }
                }
            }
-            outputFile.println( pCut + "," + numKnown + "," + numNovel + "," + ( ((double)numKnownTi) / ((double)numKnownTv) ) + "," + ( ((double)numNovelTi) / ((double)numNovelTv) ));
+            if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
+                System.out.println( "Keeping variants with p(true) >= " + String.format("%.3f",pCut) + " results in a filtered set with: " );
+                System.out.println("\t" + numKnown + " known variants");
+                System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
+                System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
+                System.out.println("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv)));
+                System.out.println();
+                foundDesiredNumVariants = true;
+            }
+            outputFile.println( pCut + "," + numKnown + "," + numNovel + "," +
+                    ( numKnownTi ==0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," +
+                    ( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
        }
    }
+    */


    private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
@ -327,7 +392,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel

        double sumProb = 0.0;
        for( int kkk = 0; kkk < numGaussians; kkk++ ) {
-            //BUGBUG: only use the good clusters here to protect against over-fitting?
            double sum = 0.0;
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) )
@ -400,6 +464,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
        }

        // Clean up extra big or extra small clusters
+        //BUGBUG: Is this a good idea?
        for( int kkk = 0; kkk < numGaussians; kkk++ ) {
            if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
                final int numToReplace = 4;
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java
@ -40,7 +40,8 @@ public final class VariantNearestNeighborsModel extends VariantOptimizationModel
    private final int numKNN;

    public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV, final int _numKNN ) {
-        super( _dataManager, _targetTITV );
+        super( _targetTITV );
+        //dataManager = _dataManager;
        numKNN = _numKNN;
    }
    
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java
@ -38,11 +38,9 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
        K_NEAREST_NEIGHBORS
    }

-    protected final VariantDataManager dataManager;
    protected final double targetTITV;

-    public VariantOptimizationModel( VariantDataManager _dataManager, final double _targetTITV ) {
-        dataManager = _dataManager;
+    public VariantOptimizationModel( final double _targetTITV ) {
        targetTITV = _targetTITV;
    }

--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java
@ -38,13 +38,12 @@ import java.io.IOException;
 */

 /**
- * Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing a callset that is optimized for
- *  an expected transition / transversion ratio.
+ * Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
 *
 * @author rpoplin
 * @since Feb 11, 2010
 *
- * @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing an optimized callset
+ * @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
 */

 public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
@ -54,26 +53,31 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
    /////////////////////////////
    @Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
    private double TARGET_TITV = 2.1;
+    //@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
+    //private int DESIRED_NUM_VARIANTS = 0;
    @Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
    private boolean IGNORE_INPUT_FILTERS = false;
    @Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
    private String[] EXCLUDED_ANNOTATIONS = null;
    @Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false)
    private String[] FORCED_ANNOTATIONS = null;
-    @Argument(fullName="output_prefix", shortName="output", doc="The output prefix added to output cluster file name and optimization curve pdf file name", required=false)
-    private String OUTPUT_PREFIX = "optimizer";
+    @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
+    private String CLUSTER_FILENAME = "optimizer.cluster";
    @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
    private int NUM_GAUSSIANS = 32;
    @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
    private int NUM_ITERATIONS = 10;
-    @Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
-    private int NUM_KNN = 2000;
-    @Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
+    @Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. Used to prevent overfitting. Default is 2000.", required=true)
+    private int MIN_VAR_IN_CLUSTER = 2000;
+
+    //@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
+    //private int NUM_KNN = 2000;
+    //@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
    private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
-    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
-    private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
-    @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
-    private String PATH_TO_RESOURCES = "R/";
+    //@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
+    //private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
+    //@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
+    //private String PATH_TO_RESOURCES = "R/";

    /////////////////////////////
    // Private Member Variables
@ -90,7 +94,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
    //---------------------------------------------------------------------------------------------------------------

    public void initialize() {
-        if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }   
+        //if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }   
    }

    //---------------------------------------------------------------------------------------------------------------
@ -177,10 +181,10 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>

    public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {

-        final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys);
+        final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
        reduceSum.clear(); // Don't need this ever again, clean up some memory

-        logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations.");
+        logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations." );
        logger.info( "The annotations are: " + annotationKeys + " and QUAL." );

        dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
@ -189,28 +193,31 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
        VariantOptimizationModel theModel;
        switch (OPTIMIZATION_MODEL) {
            case GAUSSIAN_MIXTURE_MODEL:
-                theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS  );
-                break;
-            case K_NEAREST_NEIGHBORS:
-                theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
+                theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
                break;
+            //case K_NEAREST_NEIGHBORS:
+            //    theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
+            //    break;
            default:
-                throw new StingException("Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS");
+                throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
        }
        
-        theModel.run( OUTPUT_PREFIX );
+        theModel.run( CLUSTER_FILENAME );
+
+
+        // Functionality moved to different walker

        // Execute Rscript command to plot the optimization curve
        // Print out the command line to make it clear to the user what is being executed and how one might modify it
-        final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
-        System.out.println( rScriptCommandLine );
+        //final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
+        //System.out.println( rScriptCommandLine );

        // Execute the RScript command to plot the table of truth values
-        try {
-            Runtime.getRuntime().exec( rScriptCommandLine );
-        } catch ( IOException e ) {
-            throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
-        }
+        //try {
+        //    Runtime.getRuntime().exec( rScriptCommandLine );
+        //} catch ( IOException e ) {
+        //    throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
+        //}
    }

 }