Automatic cutting of recalibrated variant calls using ApplyVariantCuts. VariantRecalibrator produces the tranches plot alongside the optimization curve. Specify the tranche levels using -tranche 1.0 -tranche 5.0, etc.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3472 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-06-02 15:03:00 +00:00
parent 4a555827aa
commit 290771a8c2
5 changed files with 253 additions and 3 deletions

View File

@ -32,4 +32,18 @@ points(data$pCut, data$numNovel,col="DarkGreen",pch=20)
axis(side=4,col="DarkGreen")
mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4)
legend("topright", c("Known","Novel"), col=c("Green","DarkGreen"), pch=c(20,20),cex=0.7)
dev.off()
# NOTE(review): dev.off() appears twice in a row — verify two devices are
# actually open at this point, otherwise the second call will error.
dev.off()
# Read the comma-separated tranches table written by VariantRecalibrator
# (header: FDRtranche,novelTITV,pCut,numNovel,filterName).
data2 = read.table(paste(input,".tranches",sep=""),sep=",",head=T)
# Render the FDR-tranche bar chart into <input>.FDRtranches.pdf.
outfile = paste(input, ".FDRtranches.pdf", sep="")
pdf(outfile, height=7, width=8)
# Fraction of novel calls implied to be true positives at each tranche,
# derived from the novel Ti/Tv relative to the target (0.5 = random).
alpha = (data2$novelTITV - 0.5) / (targetTITV - 0.5);
numGood = round(alpha * data2$numNovel);
numBad = data2$numNovel - numGood;
# Stack implied-TP (row 1) and implied-FP (row 2) counts per tranche.
d=matrix(c(numGood,numBad),2,byrow=TRUE)
barplot(d,horiz=TRUE,col=c(1,2),space=0.2,xlab="Number of Novel Variants",ylab="Novel Ti/Tv --> FDR (%)")
legend('topright',c('implied TP','implied FP'),col=c(1,2),lty=1,lwd=16)
# Label each bar twice on the left axis: FDR tranche level (outer) and the
# novel Ti/Tv achieved at that cut (inner); offsets align with barplot spacing.
axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche)
axis(2,line=0.4,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$novelTITV)
dev.off()

View File

@ -0,0 +1,195 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import Jama.Matrix;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
/**
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
*
* @author rpoplin
* @since Jun 2, 2010
*
* @help.summary Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
*/
public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=true)
private String TRANCHE_FILENAME = "optimizer.dat.tranches";
@Argument(fullName="outputVCFFile", shortName="outputVCF", doc="The output filtered VCF file", required=true)
private String OUTPUT_FILENAME = "optimizer.vcf";
/////////////////////////////
// Private Member Variables
/////////////////////////////
private VCFWriter vcfWriter;
private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();
final ExpandingArrayList<Double> qCuts = new ExpandingArrayList<Double>();
final ExpandingArrayList<String> filterName = new ExpandingArrayList<String>();
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
boolean firstLine = true;
try {
for( final String line : new XReadLines(new File( TRANCHE_FILENAME )) ) {
if( !firstLine ) {
final String[] vals = line.split(",");
qCuts.add(Double.parseDouble(vals[2]));
filterName.add(vals[4]);
}
firstLine = false;
}
} catch( FileNotFoundException e ) {
throw new StingException("Can not find input file: " + TRANCHE_FILENAME);
}
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_KEY); // copied from VariantsToVCF
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.DEPTH_KEY);
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY);
// setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
vcfWriter = new VCFWriter( new File(OUTPUT_FILENAME) );
final TreeSet<String> samples = new TreeSet<String>();
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if( rod.getRecordType().equals(VCFRecord.class) ) {
final VCFReader reader = new VCFReader(rod.getFile());
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
samples.addAll(vcfSamples);
reader.close();
}
}
// BUGBUG : why doesn't this work? VCFFilterHeaderLine is protected?
//for( int iii = 1; iii < filterName.size(); iii++ ) {
// hInfo.add(new VCFFilterHeaderLine(filterName.get(iii)), String.format("FDR tranche level at qual " + qCuts.get(iii)));
//}
//hInfo.add(new VCFFilterHeaderLine(filterName.get(0)+"+"), String.format("FDR tranche level at qual > " + qCuts.get(filterName.size()-1)));
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
}
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
return 1;
}
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false);
if( !vc.isFiltered() ) {
final double qual = vc.getPhredScaledQual();
boolean setFilter = false;
for( int tranche = qCuts.size() - 1; tranche >= 0; tranche-- ) {
if( qual >= qCuts.get(tranche) ) {
if(tranche == qCuts.size() - 1) {
vcf.setFilterString(VCFRecord.PASSES_FILTERS);
setFilter = true;
} else {
vcf.setFilterString(filterName.get(tranche));
setFilter = true;
}
break;
}
}
if( !setFilter ) {
vcf.setFilterString(filterName.get(0)+"+");
}
}
vcfWriter.addRecord( vcf );
}
}
return 1; // This value isn't used for anything
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
public Integer reduceInit() {
return 1;
}
public Integer reduce( final Integer mapValue, final Integer reduceSum ) {
return 1;
}
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
vcfWriter.close();
}
}

View File

@ -102,6 +102,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
if( IGNORE_INPUT_FILTERS != null ) {

View File

@ -367,7 +367,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
}
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix,
final int desiredNumVariants, final Double[] FDRtranches ) {
final int numVariants = data.length;
final boolean[] markedVariant = new boolean[numVariants];
@ -382,6 +383,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final double novelTiTvAtCut[] = new double[NUM_BINS];
final double theCut[] = new double[NUM_BINS];
final double fdrCutAsTiTv[] = new double[FDRtranches.length];
for( int iii = 0; iii < FDRtranches.length; iii++ ) {
fdrCutAsTiTv[iii] = (1.0 - FDRtranches[iii] / 100.0) * (targetTITV - 0.5) + 0.5;
}
for( int iii = 0; iii < numVariants; iii++ ) {
markedVariant[iii] = false;
}
@ -392,6 +398,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
}
PrintStream tranchesOutputFile;
try {
tranchesOutputFile = new PrintStream( outputPrefix + ".dat.tranches" );
tranchesOutputFile.println("FDRtranche,novelTITV,pCut,numNovel,filterName");
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat.tranches" );
}
int numKnown = 0;
int numNovel = 0;
@ -447,7 +460,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio
int checkQuantile = 0;
int tranche = FDRtranches.length - 1;
for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) {
if( tranche >= 0 && novelTiTvAtCut[jjj] >= fdrCutAsTiTv[tranche] ) {
tranchesOutputFile.println(String.format("%.2f,%.2f,%.2f,%d,FDRtranche%.2fto%.2f",
FDRtranches[tranche],novelTiTvAtCut[jjj],theCut[jjj],numNovelAtCut[jjj],
(tranche == 0 ? 0.0 : FDRtranches[tranche-1]) ,FDRtranches[tranche]));
tranche--;
}
boolean foundCut = false;
if( checkQuantile == 0 ) {
if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) {
@ -480,10 +502,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)");
logger.info("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj]));
logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj]));
logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5))))));
}
}
outputFile.close();
tranchesOutputFile.close();
}

View File

@ -81,6 +81,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private Double[] FDR_TRANCHES = null;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
@ -156,6 +158,19 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
if(!foundDBSNP) {
throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
}
// Set up default values for the FDR tranches if necessary
if( FDR_TRANCHES == null ) {
FDR_TRANCHES = new Double[3];
FDR_TRANCHES[0] = 12.5;
FDR_TRANCHES[1] = 20.0;
FDR_TRANCHES[2] = 30.0;
//FDR_TRANCHES[0] = 1.0;
//FDR_TRANCHES[1] = 5.0;
//FDR_TRANCHES[2] = 10.0;
//FDR_TRANCHES[3] = 20.0;
//FDR_TRANCHES[4] = 30.0;
}
}
//---------------------------------------------------------------------------------------------------------------
@ -226,7 +241,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS );
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS, FDR_TRANCHES );
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it