Adding a skeleton for the second step of the variant optimization process.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3023 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-03-17 17:03:40 +00:00
parent ded4ba8966
commit 1bb4394aa9
8 changed files with 274 additions and 61 deletions

View File

@ -0,0 +1,125 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
*
* @author rpoplin
* @since Mar 17, 2010
*
* @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
*/
public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
//@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=false)
//private double TARGET_TITV = 2.1;
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
private int DESIRED_NUM_VARIANTS = 0;
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
private boolean IGNORE_INPUT_FILTERS = false;
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
/////////////////////////////
// Private Member Variables
/////////////////////////////
private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>();
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
VariantOptimizationModel theModel = null;
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
// break;
default:
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
}
}
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
return mapList;
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
public ExpandingArrayList<VariantDatum> reduceInit() {
return new ExpandingArrayList<VariantDatum>();
}
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
reduceSum.addAll( mapValue );
return reduceSum;
}
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
}
}

View File

@ -33,5 +33,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
public interface VariantClusteringModel extends VariantOptimizationInterface {
public void createClusters( final VariantDatum[] data );
public void applyClusters( final VariantDatum[] data, final String outputPrefix );
//public void applyClusters( final VariantDatum[] data, final String outputPrefix );
}

View File

@ -4,6 +4,8 @@ import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.regex.Pattern;
/*
* Copyright (c) 2010 The Broad Institute
@ -40,8 +42,8 @@ public class VariantDataManager {
public final VariantDatum[] data;
public final int numVariants;
public final int numAnnotations;
public final double[] varianceVector;
public final double[] meanVector;
public final double[] varianceVector;
public boolean isNormalized;
private final ExpandingArrayList<String> annotationKeys;
@ -61,10 +63,26 @@ public class VariantDataManager {
annotationKeys = _annotationKeys;
}
public void normalizeData() {
for( int iii = 0; iii < numVariants; iii++ ) {
data[iii].originalAnnotations = data[iii].annotations.clone();
public VariantDataManager( final ExpandingArrayList<String> annotationLines ) {
data = null;
numVariants = 0;
numAnnotations = annotationLines.size();
meanVector = new double[numAnnotations];
varianceVector = new double[numAnnotations];
isNormalized = true;
annotationKeys = new ExpandingArrayList<String>();
int jjj = 0;
for( String line : annotationLines ) {
String[] vals = line.split(",");
annotationKeys.add(vals[1]);
meanVector[jjj] = Double.parseDouble(vals[2]);
varianceVector[jjj] = Double.parseDouble(vals[3]);
jjj++;
}
}
public void normalizeData() {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
final double theMean = mean(data, jjj);
final double theSTD = standardDeviation(data, theMean, jjj);
@ -99,10 +117,9 @@ public class VariantDataManager {
return Math.sqrt( sum );
}
// public void printClusterFileHeader( PrintStream outputFile ) {
// for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
// outputFile.println("@ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
// }
// }
public void printClusterFileHeader( PrintStream outputFile ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
outputFile.println("@!ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
}
}
}

View File

@ -33,7 +33,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
public class VariantDatum {
public double[] annotations;
public double[] originalAnnotations;
public boolean isTransition;
public boolean isKnown;
public double pTrue;
}

View File

@ -1,7 +1,15 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.xReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Random;
import java.util.regex.Pattern;
/*
* Copyright (c) 2010 The Broad Institute
@ -36,6 +44,7 @@ import java.util.Random;
public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {
private final VariantDataManager dataManager;
private final int numGaussians;
private final int numIterations;
private final long RANDOM_SEED = 91801305;
@ -50,9 +59,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final int[] numMaxClusterNovel;
private final double[] clusterTITV;
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
//private final int desiredNumVariants;
private final int minVarInCluster;
public VariantGaussianMixtureModel( VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations ) {
super( _dataManager, _targetTITV );
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) {
super( _targetTITV );
dataManager = _dataManager;
numGaussians = _numGaussians;
numIterations = _numIterations;
@ -63,9 +78,46 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
numMaxClusterNovel = new int[numGaussians];
clusterTITV = new double[numGaussians];
clusterTruePositiveRate = new double[numGaussians];
//desiredNumVariants = _desiredNumVariants;
minVarInCluster = _minVarInCluster;
}
public VariantGaussianMixtureModel( final String clusterFileName ) {
super( 0.0 );
ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
try {
for ( String line : new xReadLines(new File( clusterFileName )) ) {
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
annotationLines.add(line);
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
clusterLines.add(line);
} else {
throw new StingException("Malformed input file: " + clusterFileName);
}
}
} catch ( FileNotFoundException e ) {
throw new StingException("Can not find input file: " + clusterFileName);
}
dataManager = new VariantDataManager( annotationLines );
numIterations = 0;
pCluster = null;
numMaxClusterKnown = null;
numMaxClusterNovel = null;
clusterTITV = null;
clusterTruePositiveRate = null;
//desiredNumVariants = _desiredNumVariants;
minVarInCluster = 0;
numGaussians = clusterLines.size();
mu = new double[numGaussians][];
sigma = new double[numGaussians][];
}
public final void run( final String outputPrefix ) {
public final void run( final String clusterFileName ) {
// Create the subset of the data to cluster with
int numNovel = 0;
@ -78,6 +130,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random
// If there are almost as many novels as known, simply use all the variants
// BUGBUG: allow downsampling and arbitrary mixtures of knowns and novels
final int numSubset = (int)Math.floor(numNovel*2.5);
if( numSubset * 1.3 < dataManager.numVariants ) {
data = new VariantDatum[numSubset];
@ -100,10 +153,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants...");
if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); }
createClusters( data ); // Using a subset of the data
System.out.println("Printing out cluster parameters...");
printClusters( outputPrefix );
System.out.println("Applying clusters to all variants...");
applyClusters( dataManager.data, outputPrefix ); // Using all the data
System.out.println("Outputting cluster parameters...");
printClusters( clusterFileName );
//System.out.println("Applying clusters to all variants...");
//applyClusters( dataManager.data, outputPrefix ); // Using all the data
}
public final void createClusters( final VariantDatum[] data ) {
@ -198,15 +251,14 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
private void printClusters( final String outputPrefix ) {
private void printClusters( final String clusterFileName ) {
try {
final PrintStream outputFile = new PrintStream( outputPrefix + ".clusters" );
int clusterNumber = 0;
final PrintStream outputFile = new PrintStream( clusterFileName );
dataManager.printClusterFileHeader( outputFile );
final int numAnnotations = mu[0].length;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
//BUGBUG: only output the good clusters here to protect against over-fitting?
//if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= 2000 ) {
outputFile.print(clusterNumber + ",");
if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= minVarInCluster ) {
outputFile.print("@!CLUSTER,");
outputFile.print(numMaxClusterKnown[kkk] + ",");
outputFile.print(numMaxClusterNovel[kkk] + ",");
outputFile.print(clusterTITV[kkk] + ",");
@ -218,8 +270,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
outputFile.print(sigma[kkk][jjj] + ",");
}
outputFile.println(-1);
clusterNumber++;
//}
}
}
} catch (Exception e) {
e.printStackTrace();
@ -227,6 +278,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
/*
public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) {
final int numVariants = data.length;
@ -262,6 +314,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
int numKnownTv = 0;
int numNovelTi = 0;
int numNovelTv = 0;
boolean foundDesiredNumVariants = false;
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) {
for( int iii = 0; iii < numVariants; iii++ ) {
@ -286,9 +339,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
}
outputFile.println( pCut + "," + numKnown + "," + numNovel + "," + ( ((double)numKnownTi) / ((double)numKnownTv) ) + "," + ( ((double)numNovelTi) / ((double)numNovelTv) ));
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
System.out.println( "Keeping variants with p(true) >= " + String.format("%.3f",pCut) + " results in a filtered set with: " );
System.out.println("\t" + numKnown + " known variants");
System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
System.out.println("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv)));
System.out.println();
foundDesiredNumVariants = true;
}
outputFile.println( pCut + "," + numKnown + "," + numNovel + "," +
( numKnownTi ==0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," +
( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
}
}
*/
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
@ -327,7 +392,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
double sumProb = 0.0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
//BUGBUG: only use the good clusters here to protect against over-fitting?
double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) )
@ -400,6 +464,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
// Clean up extra big or extra small clusters
//BUGBUG: Is this a good idea?
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
final int numToReplace = 4;

View File

@ -40,7 +40,8 @@ public final class VariantNearestNeighborsModel extends VariantOptimizationModel
private final int numKNN;
public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV, final int _numKNN ) {
super( _dataManager, _targetTITV );
super( _targetTITV );
//dataManager = _dataManager;
numKNN = _numKNN;
}

View File

@ -38,11 +38,9 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
K_NEAREST_NEIGHBORS
}
protected final VariantDataManager dataManager;
protected final double targetTITV;
public VariantOptimizationModel( VariantDataManager _dataManager, final double _targetTITV ) {
dataManager = _dataManager;
public VariantOptimizationModel( final double _targetTITV ) {
targetTITV = _targetTITV;
}

View File

@ -38,13 +38,12 @@ import java.io.IOException;
*/
/**
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing a callset that is optimized for
* an expected transition / transversion ratio.
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
*
* @author rpoplin
* @since Feb 11, 2010
*
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing an optimized callset
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
*/
public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
@ -54,26 +53,31 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
private double TARGET_TITV = 2.1;
//@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
//private int DESIRED_NUM_VARIANTS = 0;
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
private boolean IGNORE_INPUT_FILTERS = false;
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
private String[] EXCLUDED_ANNOTATIONS = null;
@Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false)
private String[] FORCED_ANNOTATIONS = null;
@Argument(fullName="output_prefix", shortName="output", doc="The output prefix added to output cluster file name and optimization curve pdf file name", required=false)
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
private int NUM_GAUSSIANS = 32;
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
private int NUM_ITERATIONS = 10;
@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
private int NUM_KNN = 2000;
@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. Used to prevent overfitting. Default is 2000.", required=true)
private int MIN_VAR_IN_CLUSTER = 2000;
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
//private int NUM_KNN = 2000;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
//@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
//private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
//@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
//private String PATH_TO_RESOURCES = "R/";
/////////////////////////////
// Private Member Variables
@ -90,7 +94,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
//if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
}
//---------------------------------------------------------------------------------------------------------------
@ -177,10 +181,10 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys);
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations.");
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations." );
logger.info( "The annotations are: " + annotationKeys + " and QUAL." );
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
@ -189,28 +193,31 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
VariantOptimizationModel theModel;
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS );
break;
case K_NEAREST_NEIGHBORS:
theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
// break;
default:
throw new StingException("Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS");
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
}
theModel.run( OUTPUT_PREFIX );
theModel.run( CLUSTER_FILENAME );
// Functionality moved to different walker
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
System.out.println( rScriptCommandLine );
//final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
//System.out.println( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
try {
Runtime.getRuntime().exec( rScriptCommandLine );
} catch ( IOException e ) {
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
}
//try {
// Runtime.getRuntime().exec( rScriptCommandLine );
//} catch ( IOException e ) {
// throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
//}
}
}