From 1bb4394aa9f6204bf18156a3e9101817539f365e Mon Sep 17 00:00:00 2001 From: rpoplin Date: Wed, 17 Mar 2010 17:03:40 +0000 Subject: [PATCH] Adding a skeleton for the second step of the variant optimization process. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3023 348d0f76-0448-11de-a6fe-93d51630548a --- .../ApplyVariantClustersWalker.java | 125 ++++++++++++++++++ .../VariantClusteringModel.java | 2 +- .../variantoptimizer/VariantDataManager.java | 37 ++++-- .../variantoptimizer/VariantDatum.java | 2 +- .../VariantGaussianMixtureModel.java | 99 +++++++++++--- .../VariantNearestNeighborsModel.java | 3 +- .../VariantOptimizationModel.java | 4 +- .../variantoptimizer/VariantOptimizer.java | 63 +++++---- 8 files changed, 274 insertions(+), 61 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java new file mode 100755 index 000000000..6b474a923 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java @@ -0,0 +1,125 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.ExpandingArrayList; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score + * + * @author rpoplin + * @since Mar 17, 2010 + * + * @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score + */ + +public class ApplyVariantClustersWalker extends RodWalker, ExpandingArrayList> { + + ///////////////////////////// + // Command Line Arguments + ///////////////////////////// + //@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=false) + //private double TARGET_TITV = 2.1; + @Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false) + private int DESIRED_NUM_VARIANTS = 0; + @Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) + private boolean IGNORE_INPUT_FILTERS = false; + @Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false) + private String OUTPUT_PREFIX = "optimizer"; + @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true) + private String CLUSTER_FILENAME = "optimizer.cluster"; + //@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false) + private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL; + @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) + private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; + @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) + private String PATH_TO_RESOURCES = "R/"; + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private final ExpandingArrayList annotationKeys = new ExpandingArrayList(); + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + public void initialize() { + if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; } + + VariantOptimizationModel theModel = null; + switch (OPTIMIZATION_MODEL) { + case GAUSSIAN_MIXTURE_MODEL: + theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME ); + break; + //case K_NEAREST_NEIGHBORS: + // theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN ); + // break; + default: + throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" ); + } + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + public ExpandingArrayList map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + final ExpandingArrayList mapList = new ExpandingArrayList(); + return mapList; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + public ExpandingArrayList reduceInit() { + return new ExpandingArrayList(); + } + + public ExpandingArrayList reduce( final ExpandingArrayList mapValue, final ExpandingArrayList reduceSum ) { + reduceSum.addAll( mapValue ); + return reduceSum; + } + + public void onTraversalDone( ExpandingArrayList reduceSum ) { + + final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys ); + } +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java index c5afc5861..1f42a61de 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java @@ -33,5 +33,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; public interface VariantClusteringModel extends VariantOptimizationInterface { public void createClusters( final VariantDatum[] data ); - public void applyClusters( final VariantDatum[] data, final String outputPrefix ); + //public void applyClusters( final VariantDatum[] data, final String outputPrefix ); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java index 492881ae8..8c0516bfe 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java @@ -4,6 +4,8 @@ import org.broadinstitute.sting.utils.ExpandingArrayList; import org.broadinstitute.sting.utils.StingException; import java.io.PrintStream; +import java.util.ArrayList; +import java.util.regex.Pattern; /* * Copyright (c) 2010 The Broad Institute @@ -40,8 +42,8 @@ public class VariantDataManager { public final VariantDatum[] data; public final int numVariants; public final int numAnnotations; - public final double[] varianceVector; public final double[] meanVector; + public final double[] varianceVector; public boolean isNormalized; private final ExpandingArrayList annotationKeys; @@ -61,10 +63,26 @@ public class VariantDataManager { annotationKeys = _annotationKeys; } - public void normalizeData() { - for( int iii = 0; iii < numVariants; iii++ ) { - data[iii].originalAnnotations = data[iii].annotations.clone(); + public VariantDataManager( final ExpandingArrayList annotationLines ) { + data = null; + numVariants = 0; + numAnnotations = annotationLines.size(); + meanVector = new double[numAnnotations]; + varianceVector = new double[numAnnotations]; + isNormalized = true; + annotationKeys = new ExpandingArrayList(); + + int jjj = 0; + for( String line : annotationLines ) { + String[] vals = line.split(","); + annotationKeys.add(vals[1]); + meanVector[jjj] = Double.parseDouble(vals[2]); + varianceVector[jjj] = Double.parseDouble(vals[3]); + jjj++; } + } + + public void normalizeData() { for( int jjj = 0; jjj < numAnnotations; jjj++ ) { final double theMean = mean(data, jjj); final double theSTD = standardDeviation(data, theMean, jjj); @@ -99,10 +117,9 @@ public class VariantDataManager { return Math.sqrt( sum ); } -// public void printClusterFileHeader( PrintStream outputFile ) { -// for( int jjj = 0; jjj < numAnnotations; jjj++ ) { -// outputFile.println("@ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]); -// } -// } - + public void printClusterFileHeader( PrintStream outputFile ) { + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + outputFile.println("@!ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]); + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java index 83d3e83a5..c232c08c2 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java @@ -33,7 +33,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; public class VariantDatum { public double[] annotations; - public double[] originalAnnotations; public boolean isTransition; public boolean isKnown; + public double pTrue; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java index b84cb5fdc..0d7fe8494 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java @@ -1,7 +1,15 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; +import org.broadinstitute.sting.utils.ExpandingArrayList; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.xReadLines; + +import java.io.File; +import java.io.FileNotFoundException; import java.io.PrintStream; +import java.util.ArrayList; import java.util.Random; +import java.util.regex.Pattern; /* * Copyright (c) 2010 The Broad Institute @@ -36,6 +44,7 @@ import java.util.Random; public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel { + private final VariantDataManager dataManager; private final int numGaussians; private final int numIterations; private final long RANDOM_SEED = 91801305; @@ -50,9 +59,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel private final int[] numMaxClusterNovel; private final double[] clusterTITV; private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio + //private final int desiredNumVariants; + private final int minVarInCluster; - public VariantGaussianMixtureModel( VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations ) { - super( _dataManager, _targetTITV ); + private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*"); + private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*"); + + public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) { + super( _targetTITV ); + dataManager = _dataManager; numGaussians = _numGaussians; numIterations = _numIterations; @@ -63,9 +78,46 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel numMaxClusterNovel = new int[numGaussians]; clusterTITV = new double[numGaussians]; clusterTruePositiveRate = new double[numGaussians]; + //desiredNumVariants = _desiredNumVariants; + minVarInCluster = _minVarInCluster; + } + + public VariantGaussianMixtureModel( final String clusterFileName ) { + super( 0.0 ); + ExpandingArrayList annotationLines = new ExpandingArrayList(); + ExpandingArrayList clusterLines = new ExpandingArrayList(); + + try { + for ( String line : new xReadLines(new File( clusterFileName )) ) { + if( ANNOTATION_PATTERN.matcher(line).matches() ) { + annotationLines.add(line); + } else if( CLUSTER_PATTERN.matcher(line).matches() ) { + clusterLines.add(line); + } else { + throw new StingException("Malformed input file: " + clusterFileName); + } + } + } catch ( FileNotFoundException e ) { + throw new StingException("Can not find input file: " + clusterFileName); + } + + dataManager = new VariantDataManager( annotationLines ); + numIterations = 0; + pCluster = null; + numMaxClusterKnown = null; + numMaxClusterNovel = null; + clusterTITV = null; + clusterTruePositiveRate = null; + //desiredNumVariants = _desiredNumVariants; + minVarInCluster = 0; + + numGaussians = clusterLines.size(); + mu = new double[numGaussians][]; + sigma = new double[numGaussians][]; + } - public final void run( final String outputPrefix ) { + public final void run( final String clusterFileName ) { // Create the subset of the data to cluster with int numNovel = 0; @@ -78,6 +130,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel // Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random // If there are almost as many novels as known, simply use all the variants + // BUGBUG: allow downsampling and arbitrary mixtures of knowns and novels final int numSubset = (int)Math.floor(numNovel*2.5); if( numSubset * 1.3 < dataManager.numVariants ) { data = new VariantDatum[numSubset]; @@ -100,10 +153,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants..."); if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); } createClusters( data ); // Using a subset of the data - System.out.println("Printing out cluster parameters..."); - printClusters( outputPrefix ); - System.out.println("Applying clusters to all variants..."); - applyClusters( dataManager.data, outputPrefix ); // Using all the data + System.out.println("Outputting cluster parameters..."); + printClusters( clusterFileName ); + //System.out.println("Applying clusters to all variants..."); + //applyClusters( dataManager.data, outputPrefix ); // Using all the data } public final void createClusters( final VariantDatum[] data ) { @@ -198,15 +251,14 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } } - private void printClusters( final String outputPrefix ) { + private void printClusters( final String clusterFileName ) { try { - final PrintStream outputFile = new PrintStream( outputPrefix + ".clusters" ); - int clusterNumber = 0; + final PrintStream outputFile = new PrintStream( clusterFileName ); + dataManager.printClusterFileHeader( outputFile ); final int numAnnotations = mu[0].length; for( int kkk = 0; kkk < numGaussians; kkk++ ) { - //BUGBUG: only output the good clusters here to protect against over-fitting? - //if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= 2000 ) { - outputFile.print(clusterNumber + ","); + if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= minVarInCluster ) { + outputFile.print("@!CLUSTER,"); outputFile.print(numMaxClusterKnown[kkk] + ","); outputFile.print(numMaxClusterNovel[kkk] + ","); outputFile.print(clusterTITV[kkk] + ","); @@ -218,8 +270,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel outputFile.print(sigma[kkk][jjj] + ","); } outputFile.println(-1); - clusterNumber++; - //} + } } } catch (Exception e) { e.printStackTrace(); @@ -227,6 +278,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } } + /* public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) { final int numVariants = data.length; @@ -262,6 +314,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel int numKnownTv = 0; int numNovelTi = 0; int numNovelTv = 0; + boolean foundDesiredNumVariants = false; outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV"); for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) { for( int iii = 0; iii < numVariants; iii++ ) { @@ -286,9 +339,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } } } - outputFile.println( pCut + "," + numKnown + "," + numNovel + "," + ( ((double)numKnownTi) / ((double)numKnownTv) ) + "," + ( ((double)numNovelTi) / ((double)numNovelTv) )); + if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) { + System.out.println( "Keeping variants with p(true) >= " + String.format("%.3f",pCut) + " results in a filtered set with: " ); + System.out.println("\t" + numKnown + " known variants"); + System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)"); + System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv))); + System.out.println("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv))); + System.out.println(); + foundDesiredNumVariants = true; + } + outputFile.println( pCut + "," + numKnown + "," + numNovel + "," + + ( numKnownTi ==0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," + + ( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) ))); } } + */ private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) { @@ -327,7 +392,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel double sumProb = 0.0; for( int kkk = 0; kkk < numGaussians; kkk++ ) { - //BUGBUG: only use the good clusters here to protect against over-fitting? double sum = 0.0; for( int jjj = 0; jjj < numAnnotations; jjj++ ) { sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) ) @@ -400,6 +464,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } // Clean up extra big or extra small clusters + //BUGBUG: Is this a good idea? for( int kkk = 0; kkk < numGaussians; kkk++ ) { if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others final int numToReplace = 4; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java index f6fcd50fd..1c79bf583 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java @@ -40,7 +40,8 @@ public final class VariantNearestNeighborsModel extends VariantOptimizationModel private final int numKNN; public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV, final int _numKNN ) { - super( _dataManager, _targetTITV ); + super( _targetTITV ); + //dataManager = _dataManager; numKNN = _numKNN; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java index 6d85c2f52..c52e29ed3 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java @@ -38,11 +38,9 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt K_NEAREST_NEIGHBORS } - protected final VariantDataManager dataManager; protected final double targetTITV; - public VariantOptimizationModel( VariantDataManager _dataManager, final double _targetTITV ) { - dataManager = _dataManager; + public VariantOptimizationModel( final double _targetTITV ) { targetTITV = _targetTITV; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java index c108282f3..9ec266d8f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java @@ -38,13 +38,12 @@ import java.io.IOException; */ /** - * Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing a callset that is optimized for - * an expected transition / transversion ratio. + * Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets * * @author rpoplin * @since Feb 11, 2010 * - * @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing an optimized callset + * @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets */ public class VariantOptimizer extends RodWalker, ExpandingArrayList> { @@ -54,26 +53,31 @@ public class VariantOptimizer extends RodWalker ///////////////////////////// @Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true) private double TARGET_TITV = 2.1; + //@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false) + //private int DESIRED_NUM_VARIANTS = 0; @Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) private boolean IGNORE_INPUT_FILTERS = false; @Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false) private String[] EXCLUDED_ANNOTATIONS = null; @Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false) private String[] FORCED_ANNOTATIONS = null; - @Argument(fullName="output_prefix", shortName="output", doc="The output prefix added to output cluster file name and optimization curve pdf file name", required=false) - private String OUTPUT_PREFIX = "optimizer"; + @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true) + private String CLUSTER_FILENAME = "optimizer.cluster"; @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false) private int NUM_GAUSSIANS = 32; @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false) private int NUM_ITERATIONS = 10; - @Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false) - private int NUM_KNN = 2000; - @Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false) + @Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. Used to prevent overfitting. Default is 2000.", required=true) + private int MIN_VAR_IN_CLUSTER = 2000; + + //@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false) + //private int NUM_KNN = 2000; + //@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false) private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL; - @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) - private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; - @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) - private String PATH_TO_RESOURCES = "R/"; + //@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) + //private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; + //@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) + //private String PATH_TO_RESOURCES = "R/"; ///////////////////////////// // Private Member Variables @@ -90,7 +94,7 @@ public class VariantOptimizer extends RodWalker //--------------------------------------------------------------------------------------------------------------- public void initialize() { - if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; } + //if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; } } //--------------------------------------------------------------------------------------------------------------- @@ -177,10 +181,10 @@ public class VariantOptimizer extends RodWalker public void onTraversalDone( ExpandingArrayList reduceSum ) { - final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys); + final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys ); reduceSum.clear(); // Don't need this ever again, clean up some memory - logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations."); + logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations." ); logger.info( "The annotations are: " + annotationKeys + " and QUAL." ); dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ] @@ -189,28 +193,31 @@ public class VariantOptimizer extends RodWalker VariantOptimizationModel theModel; switch (OPTIMIZATION_MODEL) { case GAUSSIAN_MIXTURE_MODEL: - theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS ); - break; - case K_NEAREST_NEIGHBORS: - theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN ); + theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER ); break; + //case K_NEAREST_NEIGHBORS: + // theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN ); + // break; default: - throw new StingException("Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS"); + throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" ); } - theModel.run( OUTPUT_PREFIX ); + theModel.run( CLUSTER_FILENAME ); + + + // Functionality moved to different walker // Execute Rscript command to plot the optimization curve // Print out the command line to make it clear to the user what is being executed and how one might modify it - final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV; - System.out.println( rScriptCommandLine ); + //final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV; + //System.out.println( rScriptCommandLine ); // Execute the RScript command to plot the table of truth values - try { - Runtime.getRuntime().exec( rScriptCommandLine ); - } catch ( IOException e ) { - throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine ); - } + //try { + // Runtime.getRuntime().exec( rScriptCommandLine ); + //} catch ( IOException e ) { + // throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine ); + //} } }