Adding a skeleton for the second step of the variant optimization process.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3023 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ded4ba8966
commit
1bb4394aa9
|
|
@ -0,0 +1,125 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||||
|
*
|
||||||
|
* @author rpoplin
|
||||||
|
* @since Mar 17, 2010
|
||||||
|
*
|
||||||
|
* @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||||
|
|
||||||
|
/////////////////////////////
|
||||||
|
// Command Line Arguments
|
||||||
|
/////////////////////////////
|
||||||
|
//@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=false)
|
||||||
|
//private double TARGET_TITV = 2.1;
|
||||||
|
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
||||||
|
private int DESIRED_NUM_VARIANTS = 0;
|
||||||
|
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||||
|
private boolean IGNORE_INPUT_FILTERS = false;
|
||||||
|
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
|
||||||
|
private String OUTPUT_PREFIX = "optimizer";
|
||||||
|
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||||
|
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||||
|
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||||
|
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||||
|
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||||
|
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||||
|
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||||
|
private String PATH_TO_RESOURCES = "R/";
|
||||||
|
|
||||||
|
/////////////////////////////
|
||||||
|
// Private Member Variables
|
||||||
|
/////////////////////////////
|
||||||
|
private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>();
|
||||||
|
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// initialize
|
||||||
|
//
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||||
|
|
||||||
|
VariantOptimizationModel theModel = null;
|
||||||
|
switch (OPTIMIZATION_MODEL) {
|
||||||
|
case GAUSSIAN_MIXTURE_MODEL:
|
||||||
|
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
|
||||||
|
break;
|
||||||
|
//case K_NEAREST_NEIGHBORS:
|
||||||
|
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||||
|
// break;
|
||||||
|
default:
|
||||||
|
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// map
|
||||||
|
//
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||||
|
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||||
|
return mapList;
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// reduce
|
||||||
|
//
|
||||||
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||||
|
return new ExpandingArrayList<VariantDatum>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||||
|
reduceSum.addAll( mapValue );
|
||||||
|
return reduceSum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||||
|
|
||||||
|
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -33,5 +33,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||||
|
|
||||||
public interface VariantClusteringModel extends VariantOptimizationInterface {
|
public interface VariantClusteringModel extends VariantOptimizationInterface {
|
||||||
public void createClusters( final VariantDatum[] data );
|
public void createClusters( final VariantDatum[] data );
|
||||||
public void applyClusters( final VariantDatum[] data, final String outputPrefix );
|
//public void applyClusters( final VariantDatum[] data, final String outputPrefix );
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,8 @@ import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010 The Broad Institute
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
|
@ -40,8 +42,8 @@ public class VariantDataManager {
|
||||||
public final VariantDatum[] data;
|
public final VariantDatum[] data;
|
||||||
public final int numVariants;
|
public final int numVariants;
|
||||||
public final int numAnnotations;
|
public final int numAnnotations;
|
||||||
public final double[] varianceVector;
|
|
||||||
public final double[] meanVector;
|
public final double[] meanVector;
|
||||||
|
public final double[] varianceVector;
|
||||||
public boolean isNormalized;
|
public boolean isNormalized;
|
||||||
private final ExpandingArrayList<String> annotationKeys;
|
private final ExpandingArrayList<String> annotationKeys;
|
||||||
|
|
||||||
|
|
@ -61,10 +63,26 @@ public class VariantDataManager {
|
||||||
annotationKeys = _annotationKeys;
|
annotationKeys = _annotationKeys;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void normalizeData() {
|
public VariantDataManager( final ExpandingArrayList<String> annotationLines ) {
|
||||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
data = null;
|
||||||
data[iii].originalAnnotations = data[iii].annotations.clone();
|
numVariants = 0;
|
||||||
|
numAnnotations = annotationLines.size();
|
||||||
|
meanVector = new double[numAnnotations];
|
||||||
|
varianceVector = new double[numAnnotations];
|
||||||
|
isNormalized = true;
|
||||||
|
annotationKeys = new ExpandingArrayList<String>();
|
||||||
|
|
||||||
|
int jjj = 0;
|
||||||
|
for( String line : annotationLines ) {
|
||||||
|
String[] vals = line.split(",");
|
||||||
|
annotationKeys.add(vals[1]);
|
||||||
|
meanVector[jjj] = Double.parseDouble(vals[2]);
|
||||||
|
varianceVector[jjj] = Double.parseDouble(vals[3]);
|
||||||
|
jjj++;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void normalizeData() {
|
||||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||||
final double theMean = mean(data, jjj);
|
final double theMean = mean(data, jjj);
|
||||||
final double theSTD = standardDeviation(data, theMean, jjj);
|
final double theSTD = standardDeviation(data, theMean, jjj);
|
||||||
|
|
@ -99,10 +117,9 @@ public class VariantDataManager {
|
||||||
return Math.sqrt( sum );
|
return Math.sqrt( sum );
|
||||||
}
|
}
|
||||||
|
|
||||||
// public void printClusterFileHeader( PrintStream outputFile ) {
|
public void printClusterFileHeader( PrintStream outputFile ) {
|
||||||
// for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||||
// outputFile.println("@ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
|
outputFile.println("@!ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
|
||||||
// }
|
}
|
||||||
// }
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||||
|
|
||||||
public class VariantDatum {
|
public class VariantDatum {
|
||||||
public double[] annotations;
|
public double[] annotations;
|
||||||
public double[] originalAnnotations;
|
|
||||||
public boolean isTransition;
|
public boolean isTransition;
|
||||||
public boolean isKnown;
|
public boolean isKnown;
|
||||||
|
public double pTrue;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,15 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.xReadLines;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010 The Broad Institute
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
|
@ -36,6 +44,7 @@ import java.util.Random;
|
||||||
|
|
||||||
public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {
|
public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {
|
||||||
|
|
||||||
|
private final VariantDataManager dataManager;
|
||||||
private final int numGaussians;
|
private final int numGaussians;
|
||||||
private final int numIterations;
|
private final int numIterations;
|
||||||
private final long RANDOM_SEED = 91801305;
|
private final long RANDOM_SEED = 91801305;
|
||||||
|
|
@ -50,9 +59,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
private final int[] numMaxClusterNovel;
|
private final int[] numMaxClusterNovel;
|
||||||
private final double[] clusterTITV;
|
private final double[] clusterTITV;
|
||||||
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
|
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
|
||||||
|
//private final int desiredNumVariants;
|
||||||
|
private final int minVarInCluster;
|
||||||
|
|
||||||
public VariantGaussianMixtureModel( VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations ) {
|
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||||
super( _dataManager, _targetTITV );
|
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||||
|
|
||||||
|
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) {
|
||||||
|
super( _targetTITV );
|
||||||
|
dataManager = _dataManager;
|
||||||
numGaussians = _numGaussians;
|
numGaussians = _numGaussians;
|
||||||
numIterations = _numIterations;
|
numIterations = _numIterations;
|
||||||
|
|
||||||
|
|
@ -63,9 +78,46 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
numMaxClusterNovel = new int[numGaussians];
|
numMaxClusterNovel = new int[numGaussians];
|
||||||
clusterTITV = new double[numGaussians];
|
clusterTITV = new double[numGaussians];
|
||||||
clusterTruePositiveRate = new double[numGaussians];
|
clusterTruePositiveRate = new double[numGaussians];
|
||||||
|
//desiredNumVariants = _desiredNumVariants;
|
||||||
|
minVarInCluster = _minVarInCluster;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VariantGaussianMixtureModel( final String clusterFileName ) {
|
||||||
|
super( 0.0 );
|
||||||
|
ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
|
||||||
|
ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
|
||||||
|
|
||||||
|
try {
|
||||||
|
for ( String line : new xReadLines(new File( clusterFileName )) ) {
|
||||||
|
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
|
||||||
|
annotationLines.add(line);
|
||||||
|
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
|
||||||
|
clusterLines.add(line);
|
||||||
|
} else {
|
||||||
|
throw new StingException("Malformed input file: " + clusterFileName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new StingException("Can not find input file: " + clusterFileName);
|
||||||
|
}
|
||||||
|
|
||||||
|
dataManager = new VariantDataManager( annotationLines );
|
||||||
|
numIterations = 0;
|
||||||
|
pCluster = null;
|
||||||
|
numMaxClusterKnown = null;
|
||||||
|
numMaxClusterNovel = null;
|
||||||
|
clusterTITV = null;
|
||||||
|
clusterTruePositiveRate = null;
|
||||||
|
//desiredNumVariants = _desiredNumVariants;
|
||||||
|
minVarInCluster = 0;
|
||||||
|
|
||||||
|
numGaussians = clusterLines.size();
|
||||||
|
mu = new double[numGaussians][];
|
||||||
|
sigma = new double[numGaussians][];
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public final void run( final String outputPrefix ) {
|
public final void run( final String clusterFileName ) {
|
||||||
|
|
||||||
// Create the subset of the data to cluster with
|
// Create the subset of the data to cluster with
|
||||||
int numNovel = 0;
|
int numNovel = 0;
|
||||||
|
|
@ -78,6 +130,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
|
|
||||||
// Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random
|
// Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random
|
||||||
// If there are almost as many novels as known, simply use all the variants
|
// If there are almost as many novels as known, simply use all the variants
|
||||||
|
// BUGBUG: allow downsampling and arbitrary mixtures of knowns and novels
|
||||||
final int numSubset = (int)Math.floor(numNovel*2.5);
|
final int numSubset = (int)Math.floor(numNovel*2.5);
|
||||||
if( numSubset * 1.3 < dataManager.numVariants ) {
|
if( numSubset * 1.3 < dataManager.numVariants ) {
|
||||||
data = new VariantDatum[numSubset];
|
data = new VariantDatum[numSubset];
|
||||||
|
|
@ -100,10 +153,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants...");
|
System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants...");
|
||||||
if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); }
|
if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); }
|
||||||
createClusters( data ); // Using a subset of the data
|
createClusters( data ); // Using a subset of the data
|
||||||
System.out.println("Printing out cluster parameters...");
|
System.out.println("Outputting cluster parameters...");
|
||||||
printClusters( outputPrefix );
|
printClusters( clusterFileName );
|
||||||
System.out.println("Applying clusters to all variants...");
|
//System.out.println("Applying clusters to all variants...");
|
||||||
applyClusters( dataManager.data, outputPrefix ); // Using all the data
|
//applyClusters( dataManager.data, outputPrefix ); // Using all the data
|
||||||
}
|
}
|
||||||
|
|
||||||
public final void createClusters( final VariantDatum[] data ) {
|
public final void createClusters( final VariantDatum[] data ) {
|
||||||
|
|
@ -198,15 +251,14 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void printClusters( final String outputPrefix ) {
|
private void printClusters( final String clusterFileName ) {
|
||||||
try {
|
try {
|
||||||
final PrintStream outputFile = new PrintStream( outputPrefix + ".clusters" );
|
final PrintStream outputFile = new PrintStream( clusterFileName );
|
||||||
int clusterNumber = 0;
|
dataManager.printClusterFileHeader( outputFile );
|
||||||
final int numAnnotations = mu[0].length;
|
final int numAnnotations = mu[0].length;
|
||||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||||
//BUGBUG: only output the good clusters here to protect against over-fitting?
|
if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= minVarInCluster ) {
|
||||||
//if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= 2000 ) {
|
outputFile.print("@!CLUSTER,");
|
||||||
outputFile.print(clusterNumber + ",");
|
|
||||||
outputFile.print(numMaxClusterKnown[kkk] + ",");
|
outputFile.print(numMaxClusterKnown[kkk] + ",");
|
||||||
outputFile.print(numMaxClusterNovel[kkk] + ",");
|
outputFile.print(numMaxClusterNovel[kkk] + ",");
|
||||||
outputFile.print(clusterTITV[kkk] + ",");
|
outputFile.print(clusterTITV[kkk] + ",");
|
||||||
|
|
@ -218,8 +270,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
outputFile.print(sigma[kkk][jjj] + ",");
|
outputFile.print(sigma[kkk][jjj] + ",");
|
||||||
}
|
}
|
||||||
outputFile.println(-1);
|
outputFile.println(-1);
|
||||||
clusterNumber++;
|
}
|
||||||
//}
|
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
|
@ -227,6 +278,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) {
|
public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) {
|
||||||
|
|
||||||
final int numVariants = data.length;
|
final int numVariants = data.length;
|
||||||
|
|
@ -262,6 +314,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
int numKnownTv = 0;
|
int numKnownTv = 0;
|
||||||
int numNovelTi = 0;
|
int numNovelTi = 0;
|
||||||
int numNovelTv = 0;
|
int numNovelTv = 0;
|
||||||
|
boolean foundDesiredNumVariants = false;
|
||||||
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
|
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
|
||||||
for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) {
|
for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) {
|
||||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||||
|
|
@ -286,9 +339,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
outputFile.println( pCut + "," + numKnown + "," + numNovel + "," + ( ((double)numKnownTi) / ((double)numKnownTv) ) + "," + ( ((double)numNovelTi) / ((double)numNovelTv) ));
|
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
|
||||||
|
System.out.println( "Keeping variants with p(true) >= " + String.format("%.3f",pCut) + " results in a filtered set with: " );
|
||||||
|
System.out.println("\t" + numKnown + " known variants");
|
||||||
|
System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
|
||||||
|
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
|
||||||
|
System.out.println("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv)));
|
||||||
|
System.out.println();
|
||||||
|
foundDesiredNumVariants = true;
|
||||||
|
}
|
||||||
|
outputFile.println( pCut + "," + numKnown + "," + numNovel + "," +
|
||||||
|
( numKnownTi ==0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," +
|
||||||
|
( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
|
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
|
||||||
|
|
@ -327,7 +392,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
|
|
||||||
double sumProb = 0.0;
|
double sumProb = 0.0;
|
||||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||||
//BUGBUG: only use the good clusters here to protect against over-fitting?
|
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||||
sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) )
|
sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) )
|
||||||
|
|
@ -400,6 +464,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clean up extra big or extra small clusters
|
// Clean up extra big or extra small clusters
|
||||||
|
//BUGBUG: Is this a good idea?
|
||||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||||
if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
|
if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
|
||||||
final int numToReplace = 4;
|
final int numToReplace = 4;
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,8 @@ public final class VariantNearestNeighborsModel extends VariantOptimizationModel
|
||||||
private final int numKNN;
|
private final int numKNN;
|
||||||
|
|
||||||
public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV, final int _numKNN ) {
|
public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV, final int _numKNN ) {
|
||||||
super( _dataManager, _targetTITV );
|
super( _targetTITV );
|
||||||
|
//dataManager = _dataManager;
|
||||||
numKNN = _numKNN;
|
numKNN = _numKNN;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -38,11 +38,9 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
|
||||||
K_NEAREST_NEIGHBORS
|
K_NEAREST_NEIGHBORS
|
||||||
}
|
}
|
||||||
|
|
||||||
protected final VariantDataManager dataManager;
|
|
||||||
protected final double targetTITV;
|
protected final double targetTITV;
|
||||||
|
|
||||||
public VariantOptimizationModel( VariantDataManager _dataManager, final double _targetTITV ) {
|
public VariantOptimizationModel( final double _targetTITV ) {
|
||||||
dataManager = _dataManager;
|
|
||||||
targetTITV = _targetTITV;
|
targetTITV = _targetTITV;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -38,13 +38,12 @@ import java.io.IOException;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing a callset that is optimized for
|
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||||
* an expected transition / transversion ratio.
|
|
||||||
*
|
*
|
||||||
* @author rpoplin
|
* @author rpoplin
|
||||||
* @since Feb 11, 2010
|
* @since Feb 11, 2010
|
||||||
*
|
*
|
||||||
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing an optimized callset
|
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||||
|
|
@ -54,26 +53,31 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
|
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
|
||||||
private double TARGET_TITV = 2.1;
|
private double TARGET_TITV = 2.1;
|
||||||
|
//@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
||||||
|
//private int DESIRED_NUM_VARIANTS = 0;
|
||||||
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||||
private boolean IGNORE_INPUT_FILTERS = false;
|
private boolean IGNORE_INPUT_FILTERS = false;
|
||||||
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
|
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
|
||||||
private String[] EXCLUDED_ANNOTATIONS = null;
|
private String[] EXCLUDED_ANNOTATIONS = null;
|
||||||
@Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false)
|
@Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false)
|
||||||
private String[] FORCED_ANNOTATIONS = null;
|
private String[] FORCED_ANNOTATIONS = null;
|
||||||
@Argument(fullName="output_prefix", shortName="output", doc="The output prefix added to output cluster file name and optimization curve pdf file name", required=false)
|
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||||
private String OUTPUT_PREFIX = "optimizer";
|
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||||
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
|
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
|
||||||
private int NUM_GAUSSIANS = 32;
|
private int NUM_GAUSSIANS = 32;
|
||||||
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
|
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
|
||||||
private int NUM_ITERATIONS = 10;
|
private int NUM_ITERATIONS = 10;
|
||||||
@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. Used to prevent overfitting. Default is 2000.", required=true)
|
||||||
private int NUM_KNN = 2000;
|
private int MIN_VAR_IN_CLUSTER = 2000;
|
||||||
@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
|
||||||
|
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
||||||
|
//private int NUM_KNN = 2000;
|
||||||
|
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
//@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||||
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
//private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
//@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||||
private String PATH_TO_RESOURCES = "R/";
|
//private String PATH_TO_RESOURCES = "R/";
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Private Member Variables
|
// Private Member Variables
|
||||||
|
|
@ -90,7 +94,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
//if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -177,10 +181,10 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
|
|
||||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||||
|
|
||||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys);
|
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
||||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||||
|
|
||||||
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations.");
|
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations." );
|
||||||
logger.info( "The annotations are: " + annotationKeys + " and QUAL." );
|
logger.info( "The annotations are: " + annotationKeys + " and QUAL." );
|
||||||
|
|
||||||
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
|
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
|
||||||
|
|
@ -189,28 +193,31 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
VariantOptimizationModel theModel;
|
VariantOptimizationModel theModel;
|
||||||
switch (OPTIMIZATION_MODEL) {
|
switch (OPTIMIZATION_MODEL) {
|
||||||
case GAUSSIAN_MIXTURE_MODEL:
|
case GAUSSIAN_MIXTURE_MODEL:
|
||||||
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS );
|
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
|
||||||
break;
|
|
||||||
case K_NEAREST_NEIGHBORS:
|
|
||||||
theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
|
||||||
break;
|
break;
|
||||||
|
//case K_NEAREST_NEIGHBORS:
|
||||||
|
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||||
|
// break;
|
||||||
default:
|
default:
|
||||||
throw new StingException("Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS");
|
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
||||||
}
|
}
|
||||||
|
|
||||||
theModel.run( OUTPUT_PREFIX );
|
theModel.run( CLUSTER_FILENAME );
|
||||||
|
|
||||||
|
|
||||||
|
// Functionality moved to different walker
|
||||||
|
|
||||||
// Execute Rscript command to plot the optimization curve
|
// Execute Rscript command to plot the optimization curve
|
||||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||||
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
|
//final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
|
||||||
System.out.println( rScriptCommandLine );
|
//System.out.println( rScriptCommandLine );
|
||||||
|
|
||||||
// Execute the RScript command to plot the table of truth values
|
// Execute the RScript command to plot the table of truth values
|
||||||
try {
|
//try {
|
||||||
Runtime.getRuntime().exec( rScriptCommandLine );
|
// Runtime.getRuntime().exec( rScriptCommandLine );
|
||||||
} catch ( IOException e ) {
|
//} catch ( IOException e ) {
|
||||||
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
|
// throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
|
||||||
}
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue