Some clean up for the variant recalibrator. Now uses @Input and @Output so that it can join the Queue party. Users now specify a -o, -clusterFile, -tranchesFile, and -reportDatFile. Example on the wiki. ApplyVariantCuts now has an integration test. Base quality recalibrator now requires a dbsnp rod or vcf file. Now that the base quality recalibrator is using @Output the PrintStream shouldn't be closed in OnTraversalDone.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4101 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f2b138d975
commit
85007ffa87
|
|
@ -104,6 +104,8 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
|
|||
/////////////////////////////
|
||||
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
|
||||
private boolean DONT_SORT_OUTPUT = false;
|
||||
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
private boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
|
|
@ -183,8 +185,8 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
|
|||
break;
|
||||
}
|
||||
}
|
||||
if( !foundDBSNP ) {
|
||||
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
|
||||
if( !foundDBSNP && !RUN_WITHOUT_DBSNP ) {
|
||||
throw new StingException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation.");
|
||||
}
|
||||
|
||||
// Initialize the requested covariates by parsing the -cov argument
|
||||
|
|
@ -347,7 +349,7 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
|
|||
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
}
|
||||
|
||||
return counter; // This value isn't actually used anywhere
|
||||
return counter;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -468,8 +470,6 @@ public class CovariateCounterWalker extends LocusWalker<CovariateCounterWalker.C
|
|||
logger.info( "Writing raw recalibration data..." );
|
||||
outputToCSV( sum, RECAL_FILE );
|
||||
logger.info( "...done!" );
|
||||
|
||||
RECAL_FILE.close();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
|||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -53,21 +55,28 @@ import java.util.*;
|
|||
|
||||
public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
|
||||
private File TRANCHES_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output( doc="The output filtered VCF file", required=true)
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=true)
|
||||
private String TRANCHE_FILENAME = "optimizer.dat.tranches";
|
||||
@Argument(fullName="outputVCFFile", shortName="outputVCF", doc="The output filtered VCF file", required=true)
|
||||
private String OUTPUT_FILENAME = "optimizer.vcf";
|
||||
@Argument(fullName="fdr_filter_level", shortName="fdr_filter_level", doc="The FDR level at which to start filtering.", required=false)
|
||||
private double FDR_FILTER_LEVEL = 0.0;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private VCFWriter vcfWriter;
|
||||
final ExpandingArrayList<Double> qCuts = new ExpandingArrayList<Double>();
|
||||
final ExpandingArrayList<String> filterName = new ExpandingArrayList<String>();
|
||||
|
||||
|
|
@ -128,7 +137,7 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
// todo -- I would have updated your code but there's no integration test to protect me from unexpected effects
|
||||
boolean firstLine = true;
|
||||
try {
|
||||
for( final String line : new XReadLines(new File( TRANCHE_FILENAME )) ) {
|
||||
for( final String line : new XReadLines( TRANCHES_FILE ) ) {
|
||||
if( !firstLine ) {
|
||||
final String[] vals = line.split(",");
|
||||
double FDR = Double.parseDouble(vals[0]);
|
||||
|
|
@ -147,7 +156,7 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
firstLine = false;
|
||||
}
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new StingException("Can not find input file: " + TRANCHE_FILENAME);
|
||||
throw new StingException("Can not find input file: " + TRANCHES_FILE);
|
||||
}
|
||||
|
||||
// setup the header fields
|
||||
|
|
@ -155,7 +164,6 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFHeaderLineType.Float, "The original variant quality score"));
|
||||
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
|
||||
vcfWriter = new StandardVCFWriter( new File(OUTPUT_FILENAME) );
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
samples.addAll(SampleUtils.getSampleListWithVCFHeader(getToolkit(), null));
|
||||
|
||||
|
|
@ -195,8 +203,9 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
break;
|
||||
}
|
||||
}
|
||||
if( filterString == null )
|
||||
if( filterString == null ) {
|
||||
filterString = filterName.get(0)+"+";
|
||||
}
|
||||
|
||||
if ( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
|
||||
Set<String> filters = new HashSet<String>();
|
||||
|
|
@ -227,7 +236,6 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public void onTraversalDone( Integer reduceSum ) {
|
||||
vcfWriter.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
|||
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
|
|
@ -41,7 +42,9 @@ import org.broadinstitute.sting.utils.StingException;
|
|||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -55,6 +58,13 @@ import java.util.*;
|
|||
|
||||
public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
|
||||
@Output(fullName="cluster_file", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private PrintStream CLUSTER_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
|
|
@ -64,17 +74,15 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
|
||||
private String[] USE_ANNOTATIONS = null;
|
||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
|
||||
private int MAX_GAUSSIANS = 4;
|
||||
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
|
||||
private int MAX_ITERATIONS = 200;
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
private String PATH_TO_RSCRIPT = "Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
private String PATH_TO_RESOURCES = "R/";
|
||||
@Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
|
||||
@Argument(fullName = "path_to_Rscript", shortName="Rscript", doc="The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
// private String PATH_TO_RSCRIPT = "Rscript";
|
||||
// @Argument(fullName = "path_to_resources", shortName = "resources", doc="Path to resources folder holding the Sting R scripts.", required=false)
|
||||
// private String PATH_TO_RESOURCES = "R/";
|
||||
// @Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
|
||||
private double WEIGHT_NOVELS = 0.0;
|
||||
@Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for MQ2+ known variants during clustering", required=false)
|
||||
private double WEIGHT_KNOWNS = 0.0;
|
||||
|
|
@ -89,13 +97,12 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
|
||||
private double STD_THRESHOLD = 4.5;
|
||||
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
|
||||
private double QUAL_THRESHOLD = 1000.0;
|
||||
private double QUAL_THRESHOLD = 800.0;
|
||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
|
||||
private double SHRINKAGE = 0.0001;
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)
|
||||
private double DIRICHLET_PARAMETER = 1000.0;
|
||||
|
||||
|
||||
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
||||
//private int NUM_KNN = 2000;
|
||||
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||
|
|
@ -115,7 +122,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
// if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
|
||||
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
|
||||
|
||||
|
|
@ -222,8 +229,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
VariantGaussianMixtureModel theModel;
|
||||
switch (OPTIMIZATION_MODEL) {
|
||||
case GAUSSIAN_MIXTURE_MODEL:
|
||||
theModel = new VariantGaussianMixtureModel( dataManager, MAX_GAUSSIANS, MAX_ITERATIONS, maxAC, FORCE_INDEPENDENT,
|
||||
STD_THRESHOLD, QUAL_THRESHOLD, SHRINKAGE, DIRICHLET_PARAMETER );
|
||||
theModel = new VariantGaussianMixtureModel( dataManager, MAX_GAUSSIANS, MAX_ITERATIONS, FORCE_INDEPENDENT,
|
||||
STD_THRESHOLD, SHRINKAGE, DIRICHLET_PARAMETER );
|
||||
break;
|
||||
//case K_NEAREST_NEIGHBORS:
|
||||
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||
|
|
@ -232,7 +239,9 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
||||
}
|
||||
|
||||
theModel.run( CLUSTER_FILENAME );
|
||||
theModel.run( CLUSTER_FILE );
|
||||
|
||||
/*
|
||||
theModel.outputClusterReports( CLUSTER_FILENAME );
|
||||
|
||||
for( final String annotation : annotationKeys ) {
|
||||
|
|
@ -248,6 +257,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
Utils.warnUser("Unable to execute the RScript command because of [" + e.getMessage() + "]. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the clustering proceded as expected. We highly recommend trying to rerun the script manually if possible.");
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -73,9 +73,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
private final Matrix[] sigmaInverse;
|
||||
private double[] pClusterLog10;
|
||||
private final double[] determinant;
|
||||
private final double[] alleleCountFactorArray;
|
||||
private final double stdThreshold;
|
||||
private final double qualThreshold;
|
||||
|
||||
private double[] empiricalMu;
|
||||
private Matrix empiricalSigma;
|
||||
|
|
@ -85,12 +83,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
private final double[] hyperParameter_lambda;
|
||||
|
||||
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||
private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*");
|
||||
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||
|
||||
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _maxGaussians, final int _maxIterations,
|
||||
final int maxAC, final boolean _forceIndependent, final double _stdThreshold, final double _qualThreshold,
|
||||
final double _shrinkage, final double _dirichlet) {
|
||||
final boolean _forceIndependent, final double _stdThreshold, final double _shrinkage, final double _dirichlet) {
|
||||
dataManager = _dataManager;
|
||||
maxGaussians = _maxGaussians;
|
||||
maxIterations = _maxIterations;
|
||||
|
|
@ -99,9 +95,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
sigma = new Matrix[maxGaussians];
|
||||
determinant = new double[maxGaussians];
|
||||
pClusterLog10 = new double[maxGaussians];
|
||||
alleleCountFactorArray = new double[maxAC + 1];
|
||||
stdThreshold = _stdThreshold;
|
||||
qualThreshold = _qualThreshold;
|
||||
FORCE_INDEPENDENT_ANNOTATIONS = _forceIndependent;
|
||||
hyperParameter_a = new double[maxGaussians];
|
||||
hyperParameter_b = new double[maxGaussians];
|
||||
|
|
@ -112,26 +106,23 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
DIRICHLET_PARAMETER = _dirichlet;
|
||||
}
|
||||
|
||||
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
|
||||
public VariantGaussianMixtureModel( final double _targetTITV, final File clusterFile, final double backOffGaussianFactor ) {
|
||||
super( _targetTITV );
|
||||
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
|
||||
final ExpandingArrayList<String> alleleCountLines = new ExpandingArrayList<String>();
|
||||
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
|
||||
|
||||
try {
|
||||
for ( final String line : new XReadLines(new File( clusterFileName )) ) {
|
||||
for ( final String line : new XReadLines( clusterFile ) ) {
|
||||
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
|
||||
annotationLines.add(line);
|
||||
} else if( ALLELECOUNT_PATTERN.matcher(line).matches() ) {
|
||||
alleleCountLines.add(line);
|
||||
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
|
||||
clusterLines.add(line);
|
||||
} else {
|
||||
throw new StingException("Malformed input file: " + clusterFileName);
|
||||
throw new StingException("Malformed input file: " + clusterFile);
|
||||
}
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("Can not find input file: " + clusterFileName);
|
||||
throw new StingException("Can not find input file: " + clusterFile);
|
||||
}
|
||||
|
||||
dataManager = new VariantDataManager( annotationLines );
|
||||
|
|
@ -140,7 +131,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
DIRICHLET_PARAMETER = 0;
|
||||
maxIterations = 0;
|
||||
stdThreshold = 0.0;
|
||||
qualThreshold = 0.0;
|
||||
FORCE_INDEPENDENT_ANNOTATIONS = false;
|
||||
hyperParameter_a = null;
|
||||
hyperParameter_b = null;
|
||||
|
|
@ -155,14 +145,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
pClusterLog10 = new double[maxGaussians];
|
||||
determinant = new double[maxGaussians];
|
||||
|
||||
alleleCountFactorArray = new double[alleleCountLines.size() + 1];
|
||||
for( final String line : alleleCountLines ) {
|
||||
final String[] vals = line.split(",");
|
||||
int i = Integer.parseInt(vals[1]);
|
||||
alleleCountFactorArray[i] = Double.parseDouble(vals[2]);
|
||||
double oldACPrior = alleleCountFactorArray[i];
|
||||
}
|
||||
|
||||
int kkk = 0;
|
||||
for( final String line : clusterLines ) {
|
||||
final String[] vals = line.split(",");
|
||||
|
|
@ -183,10 +165,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
logger.info("Found " + maxGaussians + " clusters using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
|
||||
}
|
||||
|
||||
public final void run( final String clusterFileName ) {
|
||||
|
||||
// Initialize the Allele Count prior
|
||||
generateAlleleCountPrior();
|
||||
public final void run( final PrintStream clusterFile ) {
|
||||
|
||||
// int numValid = 0;
|
||||
// int numOutlier = 0;
|
||||
|
|
@ -215,7 +194,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
logger.info("Initializing using k-means...");
|
||||
initializeUsingKMeans( dataManager.data );
|
||||
logger.info("... done!");
|
||||
createClusters( dataManager.data, 0, maxGaussians, clusterFileName );
|
||||
createClusters( dataManager.data, 0, maxGaussians, clusterFile );
|
||||
|
||||
// Simply cluster with all the variants. The knowns have been given more weight than the novels
|
||||
//logger.info("Clustering with " + dataManager.data.length + " variants.");
|
||||
|
|
@ -260,43 +239,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
//empiricalSigma.timesEquals(1.0 / (Math.pow(maxGaussians, 2.0 / ((double) numAnnotations))));
|
||||
}
|
||||
|
||||
|
||||
private void generateAlleleCountPrior() {
|
||||
|
||||
final double[] acExpectation = new double[alleleCountFactorArray.length];
|
||||
final double[] acActual = new double[alleleCountFactorArray.length];
|
||||
final int[] alleleCount = new int[alleleCountFactorArray.length];
|
||||
|
||||
double sumExpectation = 0.0;
|
||||
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||
acExpectation[iii] = 1.0 / ((double) iii);
|
||||
sumExpectation += acExpectation[iii];
|
||||
}
|
||||
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||
acExpectation[iii] /= sumExpectation; // Turn acExpectation into a probability distribution
|
||||
alleleCount[iii] = 1; // Start off with one count to smooth the estimate
|
||||
}
|
||||
for( final VariantDatum datum : dataManager.data ) {
|
||||
alleleCount[datum.alleleCount]++;
|
||||
}
|
||||
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||
acActual[iii] = ((double)alleleCount[iii]) / ((double) (dataManager.data.length+(alleleCountFactorArray.length-1))); // Turn acActual into a probability distribution
|
||||
}
|
||||
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||
alleleCountFactorArray[iii] = acExpectation[iii] / acActual[iii]; // Prior is (expected / observed)
|
||||
}
|
||||
}
|
||||
|
||||
public void setSingletonFPRate( final double rate ) {
|
||||
this.singletonFPRate = rate;
|
||||
}
|
||||
|
||||
public final double getAlleleCountPrior( final int alleleCount ) {
|
||||
if ( this.singletonFPRate == -1 )
|
||||
return alleleCountFactorArray[alleleCount];
|
||||
else {
|
||||
return Math.min(0.95, 1.0 - Math.pow(singletonFPRate, alleleCount)); //TODO -- define the vals
|
||||
}
|
||||
return Math.min(0.95, 1.0 - Math.pow(singletonFPRate, alleleCount)); //TODO -- define the vals
|
||||
}
|
||||
|
||||
private void initializeUsingKMeans( final VariantDatum[] data ) {
|
||||
|
|
@ -365,7 +313,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
}
|
||||
|
||||
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName ) {
|
||||
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final PrintStream clusterFile ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
|
@ -439,38 +387,28 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Output the final cluster parameters
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
printClusterParameters( clusterFileName );
|
||||
printClusterParameters( clusterFile );
|
||||
}
|
||||
|
||||
private void printClusterParameters( final String clusterFileName ) {
|
||||
try {
|
||||
final PrintStream outputFile = new PrintStream( clusterFileName );
|
||||
dataManager.printClusterFileHeader( outputFile );
|
||||
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||
outputFile.print("@!ALLELECOUNT,");
|
||||
outputFile.println(iii + "," + alleleCountFactorArray[iii]);
|
||||
}
|
||||
private void printClusterParameters( final PrintStream clusterFile ) {
|
||||
dataManager.printClusterFileHeader( clusterFile );
|
||||
|
||||
final int numAnnotations = mu[0].length;
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
if( Math.pow(10.0, pClusterLog10[kkk]) > 1E-4 ) { // BUGBUG: make this a command line argument
|
||||
final double sigmaVals[][] = sigma[kkk].getArray();
|
||||
outputFile.print("@!CLUSTER");
|
||||
outputFile.print("," + Math.pow(10.0, pClusterLog10[kkk]));
|
||||
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
outputFile.print("," + mu[kkk][jjj]);
|
||||
}
|
||||
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
|
||||
outputFile.print("," + (sigmaVals[jjj][ppp] / hyperParameter_a[kkk]) );
|
||||
}
|
||||
}
|
||||
outputFile.println();
|
||||
final int numAnnotations = mu[0].length;
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
if( Math.pow(10.0, pClusterLog10[kkk]) > 1E-4 ) { // BUGBUG: make this a command line argument
|
||||
final double sigmaVals[][] = sigma[kkk].getArray();
|
||||
clusterFile.print("@!CLUSTER");
|
||||
clusterFile.print("," + Math.pow(10.0, pClusterLog10[kkk]));
|
||||
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
clusterFile.print("," + mu[kkk][jjj]);
|
||||
}
|
||||
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
|
||||
clusterFile.print("," + (sigmaVals[jjj][ppp] / hyperParameter_a[kkk]) );
|
||||
}
|
||||
}
|
||||
clusterFile.println();
|
||||
}
|
||||
outputFile.close();
|
||||
} catch (Exception e) {
|
||||
throw new StingException( "Unable to create output file: " + clusterFileName );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -563,13 +501,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
|
||||
annIndex++;
|
||||
outputFile.close();
|
||||
}
|
||||
|
||||
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
|
||||
}
|
||||
|
||||
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix,
|
||||
public final void outputOptimizationCurve( final VariantDatum[] data, final PrintStream outputReportDatFile, final PrintStream outputTranchesFile,
|
||||
final int desiredNumVariants, final Double[] FDRtranches, final double QUAL_STEP ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
|
|
@ -595,16 +530,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
PrintStream outputFile;
|
||||
try {
|
||||
outputFile = new PrintStream( outputPrefix + ".dat" );
|
||||
outputFile = new PrintStream( outputReportDatFile );
|
||||
} catch (Exception e) {
|
||||
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
|
||||
throw new StingException( "Unable to create output file: " + outputReportDatFile );
|
||||
}
|
||||
PrintStream tranchesOutputFile;
|
||||
try {
|
||||
tranchesOutputFile = new PrintStream( outputPrefix + ".dat.tranches" );
|
||||
tranchesOutputFile = new PrintStream( outputTranchesFile );
|
||||
tranchesOutputFile.println("FDRtranche,novelTITV,pCut,numNovel,filterName");
|
||||
} catch (Exception e) {
|
||||
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat.tranches" );
|
||||
throw new StingException( "Unable to create output file: " + outputTranchesFile );
|
||||
}
|
||||
|
||||
int numKnown = 0;
|
||||
|
|
@ -706,12 +641,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5))))));
|
||||
}
|
||||
}
|
||||
|
||||
outputFile.close();
|
||||
tranchesOutputFile.close();
|
||||
}
|
||||
|
||||
|
||||
private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
|
||||
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
|
|
@ -32,5 +34,5 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
|||
*/
|
||||
|
||||
public interface VariantOptimizationInterface {
|
||||
public void run( String outputPrefix );
|
||||
public void run( PrintStream outputPrefix );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
|||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
|
|
@ -42,7 +44,9 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -56,12 +60,29 @@ import java.util.*;
|
|||
|
||||
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
@Input(fullName="cluster_file", shortName="clusterFile", doc="The input cluster file generated by GenerateVariantClusters", required=true)
|
||||
private File CLUSTER_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyVariantCuts", required=true)
|
||||
private PrintStream TRANCHES_FILE;
|
||||
@Output(fullName="report_dat_file", shortName="reportDatFile", doc="The output report .dat file used with Rscript to create the optimization curve PDF file", required=true)
|
||||
private File REPORT_DAT_FILE;
|
||||
@Output(doc="File to which recalibrated variants should be written",required=true)
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The expected Ti/Tv ratio to display on optimization curve output figures. (~~2.1 for whole genome experiments)", required=false)
|
||||
private double TARGET_TITV = 2.1;
|
||||
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=true)
|
||||
private double TARGET_TITV = 2.07;
|
||||
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by enlarging out the Gaussians.", required=false)
|
||||
private double BACKOFF_FACTOR = 1.3;
|
||||
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
||||
private int DESIRED_NUM_VARIANTS = 0;
|
||||
|
|
@ -75,14 +96,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private double KNOWN_QUAL_PRIOR = 10.0;
|
||||
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
|
||||
private double NOVEL_QUAL_PRIOR = 2.0;
|
||||
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
|
||||
private String OUTPUT_PREFIX = "optimizer";
|
||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
||||
private Double[] FDR_TRANCHES = null;
|
||||
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
private String PATH_TO_RSCRIPT = "Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
|
|
@ -97,8 +112,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||
private VariantGaussianMixtureModel theModel = null;
|
||||
private VCFWriter vcfWriter;
|
||||
private Set<String> ignoreInputFilterSet = null;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -116,9 +131,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
switch (OPTIMIZATION_MODEL) {
|
||||
case GAUSSIAN_MIXTURE_MODEL:
|
||||
theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILENAME, BACKOFF_FACTOR );
|
||||
if ( SINGLETON_FP_RATE != -1 )
|
||||
theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILE, BACKOFF_FACTOR );
|
||||
if ( SINGLETON_FP_RATE != -1 ) {
|
||||
theModel.setSingletonFPRate(SINGLETON_FP_RATE);
|
||||
}
|
||||
break;
|
||||
//case K_NEAREST_NEIGHBORS:
|
||||
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||
|
|
@ -136,7 +152,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit()));
|
||||
|
||||
vcfWriter = new StandardVCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
|
||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
|
||||
|
|
@ -154,13 +169,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
// Set up default values for the FDR tranches if necessary
|
||||
if( FDR_TRANCHES == null ) {
|
||||
FDR_TRANCHES = new Double[6];
|
||||
FDR_TRANCHES = new Double[5];
|
||||
FDR_TRANCHES[0] = 0.1;
|
||||
FDR_TRANCHES[1] = 1.0;
|
||||
FDR_TRANCHES[2] = 5.0;
|
||||
FDR_TRANCHES[3] = 12.5;
|
||||
FDR_TRANCHES[4] = 25.0;
|
||||
FDR_TRANCHES[5] = 35.0;
|
||||
FDR_TRANCHES[3] = 10.0;
|
||||
FDR_TRANCHES[4] = 15.0;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -240,25 +254,31 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
vcfWriter.close();
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
||||
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS, FDR_TRANCHES, QUAL_STEP );
|
||||
try {
|
||||
PrintStream reportDatFilePrintStream = new PrintStream(REPORT_DAT_FILE);
|
||||
theModel.outputOptimizationCurve( dataManager.data, reportDatFilePrintStream, TRANCHES_FILE, DESIRED_NUM_VARIANTS, FDR_TRANCHES, QUAL_STEP );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("Can't create report dat file: " + REPORT_DAT_FILE.getName());
|
||||
}
|
||||
|
||||
// Execute Rscript command to plot the optimization curve
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
final String rScriptOptimizationCurveCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
|
||||
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
|
||||
final String rScriptOptimizationCurveCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV;
|
||||
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + REPORT_DAT_FILE.getName() + " " + TARGET_TITV;
|
||||
logger.info( rScriptOptimizationCurveCommandLine );
|
||||
logger.info( rScriptTranchesCommandLine );
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
try {
|
||||
Runtime.getRuntime().exec( rScriptOptimizationCurveCommandLine );
|
||||
Runtime.getRuntime().exec( rScriptTranchesCommandLine );
|
||||
} catch ( IOException e ) {
|
||||
Process p;
|
||||
p = Runtime.getRuntime().exec( rScriptOptimizationCurveCommandLine );
|
||||
p.waitFor();
|
||||
p = Runtime.getRuntime().exec( rScriptTranchesCommandLine );
|
||||
p.waitFor();
|
||||
} catch ( Exception e ) {
|
||||
Utils.warnUser("Unable to execute the RScript command. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the recalibration proceded as expected. We highly recommend trying to rerun the script manually if possible.");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,12 +7,14 @@ import java.util.*;
|
|||
import java.io.File;
|
||||
|
||||
public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||
static HashMap<String, String> paramsFiles = new HashMap<String, String>();
|
||||
|
||||
static HashMap<String, String> clusterFiles = new HashMap<String, String>();
|
||||
static HashMap<String, String> tranchesFiles = new HashMap<String, String>();
|
||||
static HashMap<String, String> inputVCFFiles = new HashMap<String, String>();
|
||||
|
||||
@Test
|
||||
public void testGenerateVariantClusters() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "005898194773d2f2be45a68f6e4c4c25" );
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e6724946c3298b3d74bb1ba1396a9190" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
|
|
@ -20,7 +22,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -T GenerateVariantClusters" +
|
||||
" -B input,VCF," + vcf +
|
||||
" -L 1:1-100,000,000" +
|
||||
|
|
@ -30,37 +32,69 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
List<File> result = executeTest("testGenerateVariantClusters", spec).getFirst();
|
||||
paramsFiles.put(vcf, result.get(0).getAbsolutePath());
|
||||
clusterFiles.put(vcf, result.get(0).getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVariantRecalibrator() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "acb89428115564cbf35ab7c006252108" );
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b97ab64b86ce8c8698855058d32853ce" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
String md5 = entry.getValue();
|
||||
String paramsFile = paramsFiles.get(vcf);
|
||||
System.out.printf("PARAMS FOR %s is %s%n", vcf, paramsFile);
|
||||
if ( paramsFile != null ) {
|
||||
File file = createTempFile("cluster",".vcf");
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
String clusterFile = clusterFiles.get(vcf);
|
||||
System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
|
||||
if ( clusterFile != null ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -T VariantRecalibrator" +
|
||||
" -B input,VCF," + vcf +
|
||||
" -L 1:40,000,000-100,000,000" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" -output " + file.getAbsolutePath().substring(0,file.getAbsolutePath().length()-4) +
|
||||
" -clusterFile " + paramsFile,
|
||||
0,
|
||||
new ArrayList<String>(0));
|
||||
" --ignore_filter HARD_TO_VALIDATE" +
|
||||
" -clusterFile " + clusterFile +
|
||||
" -titv 2.07" +
|
||||
" -o %s" +
|
||||
" -tranchesFile %s" +
|
||||
" -reportDatFile %s",
|
||||
3, // two output file
|
||||
Arrays.asList(md5, "f603aa2052ae6e81a6bde63a8a3f9539","951a17f9c11de391763b9a8cb239205a"));
|
||||
List<File> result = executeTest("testVariantRecalibrator", spec).getFirst();
|
||||
inputVCFFiles.put(vcf, result.get(0).getAbsolutePath());
|
||||
tranchesFiles.put(vcf, result.get(1).getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
spec.addAuxFile(md5, file);
|
||||
|
||||
executeTest("testVariantRecalibrator", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApplyVariantCuts() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e80f26d68be2b183fd7f062039cef28a" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
String md5 = entry.getValue();
|
||||
String inputVCFFile = inputVCFFiles.get(vcf);
|
||||
String tranchesFile = tranchesFiles.get(vcf);
|
||||
System.out.printf("PARAMS FOR %s is %s%n", vcf, inputVCFFile);
|
||||
System.out.printf("PARAMS FOR %s is %s%n", vcf, tranchesFile);
|
||||
if ( inputVCFFile != null && tranchesFile != null ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -T ApplyVariantCuts" +
|
||||
" -L 1:40,000,000-100,000,000" +
|
||||
" -B input,VCF," + inputVCFFile +
|
||||
" -o %s" +
|
||||
" -tranchesFile " + tranchesFile,
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
List<File> result = executeTest("testApplyVariantCuts", spec).getFirst();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue