Some cleanup for the variant recalibrator. Now uses @Input and @Output so that it can join the Queue party. Users now specify a -o, -clusterFile, -tranchesFile, and -reportDatFile. Example on the wiki. ApplyVariantCuts now has an integration test. The base quality recalibrator now requires a dbSNP rod or a VCF file. Now that the base quality recalibrator is using @Output, the PrintStream shouldn't be closed in onTraversalDone.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4101 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-08-24 20:14:58 +00:00
parent f2b138d975
commit 85007ffa87
7 changed files with 178 additions and 174 deletions

View File

@ -104,6 +104,8 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
/////////////////////////////
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
private boolean DONT_SORT_OUTPUT = false;
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
private boolean RUN_WITHOUT_DBSNP = false;
/////////////////////////////
// Private Member Variables
@ -183,8 +185,8 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
break;
}
}
if( !foundDBSNP ) {
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
if( !foundDBSNP && !RUN_WITHOUT_DBSNP ) {
throw new StingException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation.");
}
// Initialize the requested covariates by parsing the -cov argument
@ -347,7 +349,7 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
}
return counter; // This value isn't actually used anywhere
return counter;
}
@ -468,8 +470,6 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
logger.info( "Writing raw recalibration data..." );
outputToCSV( sum, RECAL_FILE );
logger.info( "...done!" );
RECAL_FILE.close();
}
/**

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -53,21 +55,28 @@ import java.util.*;
public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
/////////////////////////////
// Inputs
/////////////////////////////
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
private File TRANCHES_FILE;
/////////////////////////////
// Outputs
/////////////////////////////
@Output( doc="The output filtered VCF file", required=true)
private VCFWriter vcfWriter = null;
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=true)
private String TRANCHE_FILENAME = "optimizer.dat.tranches";
@Argument(fullName="outputVCFFile", shortName="outputVCF", doc="The output filtered VCF file", required=true)
private String OUTPUT_FILENAME = "optimizer.vcf";
@Argument(fullName="fdr_filter_level", shortName="fdr_filter_level", doc="The FDR level at which to start filtering.", required=false)
private double FDR_FILTER_LEVEL = 0.0;
/////////////////////////////
// Private Member Variables
/////////////////////////////
private VCFWriter vcfWriter;
final ExpandingArrayList<Double> qCuts = new ExpandingArrayList<Double>();
final ExpandingArrayList<String> filterName = new ExpandingArrayList<String>();
@ -128,7 +137,7 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
// todo -- I would have updated your code but there's no integration test to protect me from unexpected effects
boolean firstLine = true;
try {
for( final String line : new XReadLines(new File( TRANCHE_FILENAME )) ) {
for( final String line : new XReadLines( TRANCHES_FILE ) ) {
if( !firstLine ) {
final String[] vals = line.split(",");
double FDR = Double.parseDouble(vals[0]);
@ -147,7 +156,7 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
firstLine = false;
}
} catch( FileNotFoundException e ) {
throw new StingException("Can not find input file: " + TRANCHE_FILENAME);
throw new StingException("Can not find input file: " + TRANCHES_FILE);
}
// setup the header fields
@ -155,7 +164,6 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFHeaderLineType.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
vcfWriter = new StandardVCFWriter( new File(OUTPUT_FILENAME) );
final TreeSet<String> samples = new TreeSet<String>();
samples.addAll(SampleUtils.getSampleListWithVCFHeader(getToolkit(), null));
@ -195,8 +203,9 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
break;
}
}
if( filterString == null )
if( filterString == null ) {
filterString = filterName.get(0)+"+";
}
if ( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
Set<String> filters = new HashSet<String>();
@ -227,7 +236,6 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
}
public void onTraversalDone( Integer reduceSum ) {
vcfWriter.close();
}
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
@ -41,7 +42,9 @@ import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.commandline.Argument;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
/**
@ -55,6 +58,13 @@ import java.util.*;
public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
/////////////////////////////
// Outputs
/////////////////////////////
@Output(fullName="cluster_file", shortName="clusterFile", doc="The output cluster file", required=true)
private PrintStream CLUSTER_FILE;
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@ -64,17 +74,15 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
private String[] USE_ANNOTATIONS = null;
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
private int MAX_GAUSSIANS = 4;
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
private int MAX_ITERATIONS = 200;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
@Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
@Argument(fullName = "path_to_Rscript", shortName="Rscript", doc="The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
// private String PATH_TO_RSCRIPT = "Rscript";
// @Argument(fullName = "path_to_resources", shortName = "resources", doc="Path to resources folder holding the Sting R scripts.", required=false)
// private String PATH_TO_RESOURCES = "R/";
// @Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
private double WEIGHT_NOVELS = 0.0;
@Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for MQ2+ known variants during clustering", required=false)
private double WEIGHT_KNOWNS = 0.0;
@ -89,13 +97,12 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
private double STD_THRESHOLD = 4.5;
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
private double QUAL_THRESHOLD = 1000.0;
private double QUAL_THRESHOLD = 800.0;
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
private double SHRINKAGE = 0.0001;
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)
private double DIRICHLET_PARAMETER = 1000.0;
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
//private int NUM_KNN = 2000;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
@ -115,7 +122,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
// if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
@ -222,8 +229,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
VariantGaussianMixtureModel theModel;
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( dataManager, MAX_GAUSSIANS, MAX_ITERATIONS, maxAC, FORCE_INDEPENDENT,
STD_THRESHOLD, QUAL_THRESHOLD, SHRINKAGE, DIRICHLET_PARAMETER );
theModel = new VariantGaussianMixtureModel( dataManager, MAX_GAUSSIANS, MAX_ITERATIONS, FORCE_INDEPENDENT,
STD_THRESHOLD, SHRINKAGE, DIRICHLET_PARAMETER );
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
@ -232,7 +239,9 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
}
theModel.run( CLUSTER_FILENAME );
theModel.run( CLUSTER_FILE );
/*
theModel.outputClusterReports( CLUSTER_FILENAME );
for( final String annotation : annotationKeys ) {
@ -248,6 +257,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
Utils.warnUser("Unable to execute the RScript command because of [" + e.getMessage() + "]. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the clustering proceded as expected. We highly recommend trying to rerun the script manually if possible.");
}
}
*/
}
}

View File

@ -73,9 +73,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final Matrix[] sigmaInverse;
private double[] pClusterLog10;
private final double[] determinant;
private final double[] alleleCountFactorArray;
private final double stdThreshold;
private final double qualThreshold;
private double[] empiricalMu;
private Matrix empiricalSigma;
@ -85,12 +83,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final double[] hyperParameter_lambda;
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _maxGaussians, final int _maxIterations,
final int maxAC, final boolean _forceIndependent, final double _stdThreshold, final double _qualThreshold,
final double _shrinkage, final double _dirichlet) {
final boolean _forceIndependent, final double _stdThreshold, final double _shrinkage, final double _dirichlet) {
dataManager = _dataManager;
maxGaussians = _maxGaussians;
maxIterations = _maxIterations;
@ -99,9 +95,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
sigma = new Matrix[maxGaussians];
determinant = new double[maxGaussians];
pClusterLog10 = new double[maxGaussians];
alleleCountFactorArray = new double[maxAC + 1];
stdThreshold = _stdThreshold;
qualThreshold = _qualThreshold;
FORCE_INDEPENDENT_ANNOTATIONS = _forceIndependent;
hyperParameter_a = new double[maxGaussians];
hyperParameter_b = new double[maxGaussians];
@ -112,26 +106,23 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
DIRICHLET_PARAMETER = _dirichlet;
}
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
public VariantGaussianMixtureModel( final double _targetTITV, final File clusterFile, final double backOffGaussianFactor ) {
super( _targetTITV );
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
final ExpandingArrayList<String> alleleCountLines = new ExpandingArrayList<String>();
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
try {
for ( final String line : new XReadLines(new File( clusterFileName )) ) {
for ( final String line : new XReadLines( clusterFile ) ) {
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
annotationLines.add(line);
} else if( ALLELECOUNT_PATTERN.matcher(line).matches() ) {
alleleCountLines.add(line);
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
clusterLines.add(line);
} else {
throw new StingException("Malformed input file: " + clusterFileName);
throw new StingException("Malformed input file: " + clusterFile);
}
}
} catch ( FileNotFoundException e ) {
throw new StingException("Can not find input file: " + clusterFileName);
throw new StingException("Can not find input file: " + clusterFile);
}
dataManager = new VariantDataManager( annotationLines );
@ -140,7 +131,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
DIRICHLET_PARAMETER = 0;
maxIterations = 0;
stdThreshold = 0.0;
qualThreshold = 0.0;
FORCE_INDEPENDENT_ANNOTATIONS = false;
hyperParameter_a = null;
hyperParameter_b = null;
@ -155,14 +145,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
pClusterLog10 = new double[maxGaussians];
determinant = new double[maxGaussians];
alleleCountFactorArray = new double[alleleCountLines.size() + 1];
for( final String line : alleleCountLines ) {
final String[] vals = line.split(",");
int i = Integer.parseInt(vals[1]);
alleleCountFactorArray[i] = Double.parseDouble(vals[2]);
double oldACPrior = alleleCountFactorArray[i];
}
int kkk = 0;
for( final String line : clusterLines ) {
final String[] vals = line.split(",");
@ -183,10 +165,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
logger.info("Found " + maxGaussians + " clusters using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
}
public final void run( final String clusterFileName ) {
// Initialize the Allele Count prior
generateAlleleCountPrior();
public final void run( final PrintStream clusterFile ) {
// int numValid = 0;
// int numOutlier = 0;
@ -215,7 +194,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
logger.info("Initializing using k-means...");
initializeUsingKMeans( dataManager.data );
logger.info("... done!");
createClusters( dataManager.data, 0, maxGaussians, clusterFileName );
createClusters( dataManager.data, 0, maxGaussians, clusterFile );
// Simply cluster with all the variants. The knowns have been given more weight than the novels
//logger.info("Clustering with " + dataManager.data.length + " variants.");
@ -260,43 +239,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
//empiricalSigma.timesEquals(1.0 / (Math.pow(maxGaussians, 2.0 / ((double) numAnnotations))));
}
private void generateAlleleCountPrior() {
final double[] acExpectation = new double[alleleCountFactorArray.length];
final double[] acActual = new double[alleleCountFactorArray.length];
final int[] alleleCount = new int[alleleCountFactorArray.length];
double sumExpectation = 0.0;
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
acExpectation[iii] = 1.0 / ((double) iii);
sumExpectation += acExpectation[iii];
}
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
acExpectation[iii] /= sumExpectation; // Turn acExpectation into a probability distribution
alleleCount[iii] = 1; // Start off with one count to smooth the estimate
}
for( final VariantDatum datum : dataManager.data ) {
alleleCount[datum.alleleCount]++;
}
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
acActual[iii] = ((double)alleleCount[iii]) / ((double) (dataManager.data.length+(alleleCountFactorArray.length-1))); // Turn acActual into a probability distribution
}
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
alleleCountFactorArray[iii] = acExpectation[iii] / acActual[iii]; // Prior is (expected / observed)
}
}
public void setSingletonFPRate( final double rate ) {
this.singletonFPRate = rate;
}
public final double getAlleleCountPrior( final int alleleCount ) {
if ( this.singletonFPRate == -1 )
return alleleCountFactorArray[alleleCount];
else {
return Math.min(0.95, 1.0 - Math.pow(singletonFPRate, alleleCount)); //TODO -- define the vals
}
return Math.min(0.95, 1.0 - Math.pow(singletonFPRate, alleleCount)); //TODO -- define the vals
}
private void initializeUsingKMeans( final VariantDatum[] data ) {
@ -365,7 +313,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName ) {
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final PrintStream clusterFile ) {
final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length;
@ -439,38 +387,28 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Output the final cluster parameters
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
printClusterParameters( clusterFileName );
printClusterParameters( clusterFile );
}
private void printClusterParameters( final String clusterFileName ) {
try {
final PrintStream outputFile = new PrintStream( clusterFileName );
dataManager.printClusterFileHeader( outputFile );
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
outputFile.print("@!ALLELECOUNT,");
outputFile.println(iii + "," + alleleCountFactorArray[iii]);
}
private void printClusterParameters( final PrintStream clusterFile ) {
dataManager.printClusterFileHeader( clusterFile );
final int numAnnotations = mu[0].length;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
if( Math.pow(10.0, pClusterLog10[kkk]) > 1E-4 ) { // BUGBUG: make this a command line argument
final double sigmaVals[][] = sigma[kkk].getArray();
outputFile.print("@!CLUSTER");
outputFile.print("," + Math.pow(10.0, pClusterLog10[kkk]));
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
outputFile.print("," + mu[kkk][jjj]);
}
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
outputFile.print("," + (sigmaVals[jjj][ppp] / hyperParameter_a[kkk]) );
}
}
outputFile.println();
final int numAnnotations = mu[0].length;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
if( Math.pow(10.0, pClusterLog10[kkk]) > 1E-4 ) { // BUGBUG: make this a command line argument
final double sigmaVals[][] = sigma[kkk].getArray();
clusterFile.print("@!CLUSTER");
clusterFile.print("," + Math.pow(10.0, pClusterLog10[kkk]));
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
clusterFile.print("," + mu[kkk][jjj]);
}
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
clusterFile.print("," + (sigmaVals[jjj][ppp] / hyperParameter_a[kkk]) );
}
}
clusterFile.println();
}
outputFile.close();
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + clusterFileName );
}
}
@ -563,13 +501,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
annIndex++;
outputFile.close();
}
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
}
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix,
public final void outputOptimizationCurve( final VariantDatum[] data, final PrintStream outputReportDatFile, final PrintStream outputTranchesFile,
final int desiredNumVariants, final Double[] FDRtranches, final double QUAL_STEP ) {
final int numVariants = data.length;
@ -595,16 +530,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
PrintStream outputFile;
try {
outputFile = new PrintStream( outputPrefix + ".dat" );
outputFile = new PrintStream( outputReportDatFile );
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
throw new StingException( "Unable to create output file: " + outputReportDatFile );
}
PrintStream tranchesOutputFile;
try {
tranchesOutputFile = new PrintStream( outputPrefix + ".dat.tranches" );
tranchesOutputFile = new PrintStream( outputTranchesFile );
tranchesOutputFile.println("FDRtranche,novelTITV,pCut,numNovel,filterName");
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat.tranches" );
throw new StingException( "Unable to create output file: " + outputTranchesFile );
}
int numKnown = 0;
@ -706,12 +641,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5))))));
}
}
outputFile.close();
tranchesOutputFile.close();
}
private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
final int numAnnotations = data[0].annotations.length;

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import java.io.PrintStream;
/*
* Copyright (c) 2010 The Broad Institute
*
@ -32,5 +34,5 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
*/
public interface VariantOptimizationInterface {
public void run( String outputPrefix );
public void run( PrintStream outputPrefix );
}

View File

@ -28,6 +28,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
@ -42,7 +44,9 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
/**
@ -56,12 +60,29 @@ import java.util.*;
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
/////////////////////////////
// Inputs
/////////////////////////////
@Input(fullName="cluster_file", shortName="clusterFile", doc="The input cluster file generated by GenerateVariantClusters", required=true)
private File CLUSTER_FILE;
/////////////////////////////
// Outputs
/////////////////////////////
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyVariantCuts", required=true)
private PrintStream TRANCHES_FILE;
@Output(fullName="report_dat_file", shortName="reportDatFile", doc="The output report .dat file used with Rscript to create the optimization curve PDF file", required=true)
private File REPORT_DAT_FILE;
@Output(doc="File to which recalibrated variants should be written",required=true)
private VCFWriter vcfWriter = null;
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The expected Ti/Tv ratio to display on optimization curve output figures. (~~2.1 for whole genome experiments)", required=false)
private double TARGET_TITV = 2.1;
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=true)
private double TARGET_TITV = 2.07;
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by enlarging out the Gaussians.", required=false)
private double BACKOFF_FACTOR = 1.3;
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
private int DESIRED_NUM_VARIANTS = 0;
@ -75,14 +96,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private double KNOWN_QUAL_PRIOR = 10.0;
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private double NOVEL_QUAL_PRIOR = 2.0;
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private Double[] FDR_TRANCHES = null;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
@ -97,8 +112,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
/////////////////////////////
// Private Member Variables
/////////////////////////////
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
private VariantGaussianMixtureModel theModel = null;
private VCFWriter vcfWriter;
private Set<String> ignoreInputFilterSet = null;
//---------------------------------------------------------------------------------------------------------------
@ -116,9 +131,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILENAME, BACKOFF_FACTOR );
if ( SINGLETON_FP_RATE != -1 )
theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILE, BACKOFF_FACTOR );
if ( SINGLETON_FP_RATE != -1 ) {
theModel.setSingletonFPRate(SINGLETON_FP_RATE);
}
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
@ -136,7 +152,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit()));
vcfWriter = new StandardVCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
@ -154,13 +169,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Set up default values for the FDR tranches if necessary
if( FDR_TRANCHES == null ) {
FDR_TRANCHES = new Double[6];
FDR_TRANCHES = new Double[5];
FDR_TRANCHES[0] = 0.1;
FDR_TRANCHES[1] = 1.0;
FDR_TRANCHES[2] = 5.0;
FDR_TRANCHES[3] = 12.5;
FDR_TRANCHES[4] = 25.0;
FDR_TRANCHES[5] = 35.0;
FDR_TRANCHES[3] = 10.0;
FDR_TRANCHES[4] = 15.0;
}
}
@ -240,25 +254,31 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
vcfWriter.close();
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS, FDR_TRANCHES, QUAL_STEP );
try {
PrintStream reportDatFilePrintStream = new PrintStream(REPORT_DAT_FILE);
theModel.outputOptimizationCurve( dataManager.data, reportDatFilePrintStream, TRANCHES_FILE, DESIRED_NUM_VARIANTS, FDR_TRANCHES, QUAL_STEP );
} catch ( FileNotFoundException e ) {
throw new StingException("Can't create report dat file: " + REPORT_DAT_FILE.getName());
}
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptOptimizationCurveCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
final String rScriptOptimizationCurveCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV;
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV;
logger.info( rScriptOptimizationCurveCommandLine );
logger.info( rScriptTranchesCommandLine );
// Execute the RScript command to plot the table of truth values
try {
Runtime.getRuntime().exec( rScriptOptimizationCurveCommandLine );
Runtime.getRuntime().exec( rScriptTranchesCommandLine );
} catch ( IOException e ) {
Process p;
p = Runtime.getRuntime().exec( rScriptOptimizationCurveCommandLine );
p.waitFor();
p = Runtime.getRuntime().exec( rScriptTranchesCommandLine );
p.waitFor();
} catch ( Exception e ) {
Utils.warnUser("Unable to execute the RScript command. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the recalibration proceded as expected. We highly recommend trying to rerun the script manually if possible.");
}
}

View File

@ -7,12 +7,14 @@ import java.util.*;
import java.io.File;
public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
static HashMap<String, String> paramsFiles = new HashMap<String, String>();
static HashMap<String, String> clusterFiles = new HashMap<String, String>();
static HashMap<String, String> tranchesFiles = new HashMap<String, String>();
static HashMap<String, String> inputVCFFiles = new HashMap<String, String>();
@Test
public void testGenerateVariantClusters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "005898194773d2f2be45a68f6e4c4c25" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e6724946c3298b3d74bb1ba1396a9190" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -20,7 +22,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -T GenerateVariantClusters" +
" -B input,VCF," + vcf +
" -L 1:1-100,000,000" +
@ -30,37 +32,69 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
1, // just one output file
Arrays.asList(md5));
List<File> result = executeTest("testGenerateVariantClusters", spec).getFirst();
paramsFiles.put(vcf, result.get(0).getAbsolutePath());
clusterFiles.put(vcf, result.get(0).getAbsolutePath());
}
}
@Test
public void testVariantRecalibrator() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "acb89428115564cbf35ab7c006252108" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b97ab64b86ce8c8698855058d32853ce" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
String md5 = entry.getValue();
String paramsFile = paramsFiles.get(vcf);
System.out.printf("PARAMS FOR %s is %s%n", vcf, paramsFile);
if ( paramsFile != null ) {
File file = createTempFile("cluster",".vcf");
WalkerTestSpec spec = new WalkerTestSpec(
String clusterFile = clusterFiles.get(vcf);
System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
if ( clusterFile != null ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -T VariantRecalibrator" +
" -B input,VCF," + vcf +
" -L 1:40,000,000-100,000,000" +
" --ignore_filter GATK_STANDARD" +
" -output " + file.getAbsolutePath().substring(0,file.getAbsolutePath().length()-4) +
" -clusterFile " + paramsFile,
0,
new ArrayList<String>(0));
" --ignore_filter HARD_TO_VALIDATE" +
" -clusterFile " + clusterFile +
" -titv 2.07" +
" -o %s" +
" -tranchesFile %s" +
" -reportDatFile %s",
3, // three output files: the recalibrated VCF (-o), the tranches file, and the report dat file
Arrays.asList(md5, "f603aa2052ae6e81a6bde63a8a3f9539","951a17f9c11de391763b9a8cb239205a"));
List<File> result = executeTest("testVariantRecalibrator", spec).getFirst();
inputVCFFiles.put(vcf, result.get(0).getAbsolutePath());
tranchesFiles.put(vcf, result.get(1).getAbsolutePath());
}
}
spec.addAuxFile(md5, file);
executeTest("testVariantRecalibrator", spec);
}
@Test
public void testApplyVariantCuts() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e80f26d68be2b183fd7f062039cef28a" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
String md5 = entry.getValue();
String inputVCFFile = inputVCFFiles.get(vcf);
String tranchesFile = tranchesFiles.get(vcf);
System.out.printf("PARAMS FOR %s is %s%n", vcf, inputVCFFile);
System.out.printf("PARAMS FOR %s is %s%n", vcf, tranchesFile);
if ( inputVCFFile != null && tranchesFile != null ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -T ApplyVariantCuts" +
" -L 1:40,000,000-100,000,000" +
" -B input,VCF," + inputVCFFile +
" -o %s" +
" -tranchesFile " + tranchesFile,
1, // just one output file
Arrays.asList(md5));
List<File> result = executeTest("testApplyVariantCuts", spec).getFirst();
}
}
}