Adding a skeleton for the second step of the variant optimization process.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3023 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ded4ba8966
commit
1bb4394aa9
|
|
@ -0,0 +1,125 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Mar 17, 2010
|
||||
*
|
||||
* @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*/
|
||||
|
||||
public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
//@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=false)
|
||||
//private double TARGET_TITV = 2.1;
|
||||
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
||||
private int DESIRED_NUM_VARIANTS = 0;
|
||||
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_INPUT_FILTERS = false;
|
||||
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
|
||||
private String OUTPUT_PREFIX = "optimizer";
|
||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
private String PATH_TO_RESOURCES = "R/";
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>();
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
|
||||
VariantOptimizationModel theModel = null;
|
||||
switch (OPTIMIZATION_MODEL) {
|
||||
case GAUSSIAN_MIXTURE_MODEL:
|
||||
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
|
||||
break;
|
||||
//case K_NEAREST_NEIGHBORS:
|
||||
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||
// break;
|
||||
default:
|
||||
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||
return mapList;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -33,5 +33,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
|||
|
||||
public interface VariantClusteringModel extends VariantOptimizationInterface {
|
||||
public void createClusters( final VariantDatum[] data );
|
||||
public void applyClusters( final VariantDatum[] data, final String outputPrefix );
|
||||
//public void applyClusters( final VariantDatum[] data, final String outputPrefix );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ import org.broadinstitute.sting.utils.ExpandingArrayList;
|
|||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
|
|
@ -40,8 +42,8 @@ public class VariantDataManager {
|
|||
public final VariantDatum[] data;
|
||||
public final int numVariants;
|
||||
public final int numAnnotations;
|
||||
public final double[] varianceVector;
|
||||
public final double[] meanVector;
|
||||
public final double[] varianceVector;
|
||||
public boolean isNormalized;
|
||||
private final ExpandingArrayList<String> annotationKeys;
|
||||
|
||||
|
|
@ -61,10 +63,26 @@ public class VariantDataManager {
|
|||
annotationKeys = _annotationKeys;
|
||||
}
|
||||
|
||||
public void normalizeData() {
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
data[iii].originalAnnotations = data[iii].annotations.clone();
|
||||
public VariantDataManager( final ExpandingArrayList<String> annotationLines ) {
|
||||
data = null;
|
||||
numVariants = 0;
|
||||
numAnnotations = annotationLines.size();
|
||||
meanVector = new double[numAnnotations];
|
||||
varianceVector = new double[numAnnotations];
|
||||
isNormalized = true;
|
||||
annotationKeys = new ExpandingArrayList<String>();
|
||||
|
||||
int jjj = 0;
|
||||
for( String line : annotationLines ) {
|
||||
String[] vals = line.split(",");
|
||||
annotationKeys.add(vals[1]);
|
||||
meanVector[jjj] = Double.parseDouble(vals[2]);
|
||||
varianceVector[jjj] = Double.parseDouble(vals[3]);
|
||||
jjj++;
|
||||
}
|
||||
}
|
||||
|
||||
public void normalizeData() {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
final double theMean = mean(data, jjj);
|
||||
final double theSTD = standardDeviation(data, theMean, jjj);
|
||||
|
|
@ -99,10 +117,9 @@ public class VariantDataManager {
|
|||
return Math.sqrt( sum );
|
||||
}
|
||||
|
||||
// public void printClusterFileHeader( PrintStream outputFile ) {
|
||||
// for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
// outputFile.println("@ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
|
||||
// }
|
||||
// }
|
||||
|
||||
public void printClusterFileHeader( PrintStream outputFile ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
outputFile.println("@!ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
|||
|
||||
public class VariantDatum {
|
||||
public double[] annotations;
|
||||
public double[] originalAnnotations;
|
||||
public boolean isTransition;
|
||||
public boolean isKnown;
|
||||
public double pTrue;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,15 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.xReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Random;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
|
|
@ -36,6 +44,7 @@ import java.util.Random;
|
|||
|
||||
public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {
|
||||
|
||||
private final VariantDataManager dataManager;
|
||||
private final int numGaussians;
|
||||
private final int numIterations;
|
||||
private final long RANDOM_SEED = 91801305;
|
||||
|
|
@ -50,9 +59,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
private final int[] numMaxClusterNovel;
|
||||
private final double[] clusterTITV;
|
||||
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
|
||||
//private final int desiredNumVariants;
|
||||
private final int minVarInCluster;
|
||||
|
||||
public VariantGaussianMixtureModel( VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations ) {
|
||||
super( _dataManager, _targetTITV );
|
||||
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||
|
||||
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) {
|
||||
super( _targetTITV );
|
||||
dataManager = _dataManager;
|
||||
numGaussians = _numGaussians;
|
||||
numIterations = _numIterations;
|
||||
|
||||
|
|
@ -63,9 +78,46 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
numMaxClusterNovel = new int[numGaussians];
|
||||
clusterTITV = new double[numGaussians];
|
||||
clusterTruePositiveRate = new double[numGaussians];
|
||||
//desiredNumVariants = _desiredNumVariants;
|
||||
minVarInCluster = _minVarInCluster;
|
||||
}
|
||||
|
||||
public VariantGaussianMixtureModel( final String clusterFileName ) {
|
||||
super( 0.0 );
|
||||
ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
|
||||
ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
|
||||
|
||||
try {
|
||||
for ( String line : new xReadLines(new File( clusterFileName )) ) {
|
||||
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
|
||||
annotationLines.add(line);
|
||||
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
|
||||
clusterLines.add(line);
|
||||
} else {
|
||||
throw new StingException("Malformed input file: " + clusterFileName);
|
||||
}
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("Can not find input file: " + clusterFileName);
|
||||
}
|
||||
|
||||
dataManager = new VariantDataManager( annotationLines );
|
||||
numIterations = 0;
|
||||
pCluster = null;
|
||||
numMaxClusterKnown = null;
|
||||
numMaxClusterNovel = null;
|
||||
clusterTITV = null;
|
||||
clusterTruePositiveRate = null;
|
||||
//desiredNumVariants = _desiredNumVariants;
|
||||
minVarInCluster = 0;
|
||||
|
||||
numGaussians = clusterLines.size();
|
||||
mu = new double[numGaussians][];
|
||||
sigma = new double[numGaussians][];
|
||||
|
||||
}
|
||||
|
||||
public final void run( final String outputPrefix ) {
|
||||
public final void run( final String clusterFileName ) {
|
||||
|
||||
// Create the subset of the data to cluster with
|
||||
int numNovel = 0;
|
||||
|
|
@ -78,6 +130,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
// Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random
|
||||
// If there are almost as many novels as known, simply use all the variants
|
||||
// BUGBUG: allow downsampling and arbitrary mixtures of knowns and novels
|
||||
final int numSubset = (int)Math.floor(numNovel*2.5);
|
||||
if( numSubset * 1.3 < dataManager.numVariants ) {
|
||||
data = new VariantDatum[numSubset];
|
||||
|
|
@ -100,10 +153,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants...");
|
||||
if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); }
|
||||
createClusters( data ); // Using a subset of the data
|
||||
System.out.println("Printing out cluster parameters...");
|
||||
printClusters( outputPrefix );
|
||||
System.out.println("Applying clusters to all variants...");
|
||||
applyClusters( dataManager.data, outputPrefix ); // Using all the data
|
||||
System.out.println("Outputting cluster parameters...");
|
||||
printClusters( clusterFileName );
|
||||
//System.out.println("Applying clusters to all variants...");
|
||||
//applyClusters( dataManager.data, outputPrefix ); // Using all the data
|
||||
}
|
||||
|
||||
public final void createClusters( final VariantDatum[] data ) {
|
||||
|
|
@ -198,15 +251,14 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
}
|
||||
|
||||
private void printClusters( final String outputPrefix ) {
|
||||
private void printClusters( final String clusterFileName ) {
|
||||
try {
|
||||
final PrintStream outputFile = new PrintStream( outputPrefix + ".clusters" );
|
||||
int clusterNumber = 0;
|
||||
final PrintStream outputFile = new PrintStream( clusterFileName );
|
||||
dataManager.printClusterFileHeader( outputFile );
|
||||
final int numAnnotations = mu[0].length;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
//BUGBUG: only output the good clusters here to protect against over-fitting?
|
||||
//if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= 2000 ) {
|
||||
outputFile.print(clusterNumber + ",");
|
||||
if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= minVarInCluster ) {
|
||||
outputFile.print("@!CLUSTER,");
|
||||
outputFile.print(numMaxClusterKnown[kkk] + ",");
|
||||
outputFile.print(numMaxClusterNovel[kkk] + ",");
|
||||
outputFile.print(clusterTITV[kkk] + ",");
|
||||
|
|
@ -218,8 +270,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
outputFile.print(sigma[kkk][jjj] + ",");
|
||||
}
|
||||
outputFile.println(-1);
|
||||
clusterNumber++;
|
||||
//}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
|
|
@ -227,6 +278,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
|
|
@ -262,6 +314,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
int numKnownTv = 0;
|
||||
int numNovelTi = 0;
|
||||
int numNovelTv = 0;
|
||||
boolean foundDesiredNumVariants = false;
|
||||
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
|
||||
for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) {
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
|
|
@ -286,9 +339,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
}
|
||||
}
|
||||
outputFile.println( pCut + "," + numKnown + "," + numNovel + "," + ( ((double)numKnownTi) / ((double)numKnownTv) ) + "," + ( ((double)numNovelTi) / ((double)numNovelTv) ));
|
||||
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
|
||||
System.out.println( "Keeping variants with p(true) >= " + String.format("%.3f",pCut) + " results in a filtered set with: " );
|
||||
System.out.println("\t" + numKnown + " known variants");
|
||||
System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
|
||||
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
|
||||
System.out.println("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv)));
|
||||
System.out.println();
|
||||
foundDesiredNumVariants = true;
|
||||
}
|
||||
outputFile.println( pCut + "," + numKnown + "," + numNovel + "," +
|
||||
( numKnownTi ==0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," +
|
||||
( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
|
||||
|
|
@ -327,7 +392,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
double sumProb = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
//BUGBUG: only use the good clusters here to protect against over-fitting?
|
||||
double sum = 0.0;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) )
|
||||
|
|
@ -400,6 +464,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
|
||||
// Clean up extra big or extra small clusters
|
||||
//BUGBUG: Is this a good idea?
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
|
||||
final int numToReplace = 4;
|
||||
|
|
|
|||
|
|
@ -40,7 +40,8 @@ public final class VariantNearestNeighborsModel extends VariantOptimizationModel
|
|||
private final int numKNN;
|
||||
|
||||
public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV, final int _numKNN ) {
|
||||
super( _dataManager, _targetTITV );
|
||||
super( _targetTITV );
|
||||
//dataManager = _dataManager;
|
||||
numKNN = _numKNN;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -38,11 +38,9 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
|
|||
K_NEAREST_NEIGHBORS
|
||||
}
|
||||
|
||||
protected final VariantDataManager dataManager;
|
||||
protected final double targetTITV;
|
||||
|
||||
public VariantOptimizationModel( VariantDataManager _dataManager, final double _targetTITV ) {
|
||||
dataManager = _dataManager;
|
||||
public VariantOptimizationModel( final double _targetTITV ) {
|
||||
targetTITV = _targetTITV;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -38,13 +38,12 @@ import java.io.IOException;
|
|||
*/
|
||||
|
||||
/**
|
||||
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing a callset that is optimized for
|
||||
* an expected transition / transversion ratio.
|
||||
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Feb 11, 2010
|
||||
*
|
||||
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing an optimized callset
|
||||
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*/
|
||||
|
||||
public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
|
@ -54,26 +53,31 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
/////////////////////////////
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
|
||||
private double TARGET_TITV = 2.1;
|
||||
//@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
||||
//private int DESIRED_NUM_VARIANTS = 0;
|
||||
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_INPUT_FILTERS = false;
|
||||
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
|
||||
private String[] EXCLUDED_ANNOTATIONS = null;
|
||||
@Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false)
|
||||
private String[] FORCED_ANNOTATIONS = null;
|
||||
@Argument(fullName="output_prefix", shortName="output", doc="The output prefix added to output cluster file name and optimization curve pdf file name", required=false)
|
||||
private String OUTPUT_PREFIX = "optimizer";
|
||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
|
||||
private int NUM_GAUSSIANS = 32;
|
||||
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
|
||||
private int NUM_ITERATIONS = 10;
|
||||
@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
||||
private int NUM_KNN = 2000;
|
||||
@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. Used to prevent overfitting. Default is 2000.", required=true)
|
||||
private int MIN_VAR_IN_CLUSTER = 2000;
|
||||
|
||||
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
||||
//private int NUM_KNN = 2000;
|
||||
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
private String PATH_TO_RESOURCES = "R/";
|
||||
//@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
//private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||
//@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
//private String PATH_TO_RESOURCES = "R/";
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
|
|
@ -90,7 +94,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
//if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -177,10 +181,10 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys);
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
||||
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations.");
|
||||
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations." );
|
||||
logger.info( "The annotations are: " + annotationKeys + " and QUAL." );
|
||||
|
||||
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
|
||||
|
|
@ -189,28 +193,31 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
VariantOptimizationModel theModel;
|
||||
switch (OPTIMIZATION_MODEL) {
|
||||
case GAUSSIAN_MIXTURE_MODEL:
|
||||
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS );
|
||||
break;
|
||||
case K_NEAREST_NEIGHBORS:
|
||||
theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
|
||||
break;
|
||||
//case K_NEAREST_NEIGHBORS:
|
||||
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||
// break;
|
||||
default:
|
||||
throw new StingException("Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS");
|
||||
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
||||
}
|
||||
|
||||
theModel.run( OUTPUT_PREFIX );
|
||||
theModel.run( CLUSTER_FILENAME );
|
||||
|
||||
|
||||
// Functionality moved to different walker
|
||||
|
||||
// Execute Rscript command to plot the optimization curve
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
|
||||
System.out.println( rScriptCommandLine );
|
||||
//final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
|
||||
//System.out.println( rScriptCommandLine );
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
try {
|
||||
Runtime.getRuntime().exec( rScriptCommandLine );
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
|
||||
}
|
||||
//try {
|
||||
// Runtime.getRuntime().exec( rScriptCommandLine );
|
||||
//} catch ( IOException e ) {
|
||||
// throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
|
||||
//}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue