Automatic cutting of recalibrated variant calls using ApplyVariantCuts. VariantRecalibrator now produces the tranches plot alongside the optimization curve. Specify the FDR levels with repeated arguments, e.g. -tranche 1.0 -tranche 5.0, etc.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3472 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4a555827aa
commit
290771a8c2
|
|
@ -32,4 +32,18 @@ points(data$pCut, data$numNovel,col="DarkGreen",pch=20)
|
|||
axis(side=4,col="DarkGreen")
|
||||
mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4)
|
||||
legend("topright", c("Known","Novel"), col=c("Green","DarkGreen"), pch=c(20,20),cex=0.7)
|
||||
dev.off()
|
||||
dev.off()
|
||||
|
||||
|
||||
data2 = read.table(paste(input,".tranches",sep=""),sep=",",head=T)
|
||||
outfile = paste(input, ".FDRtranches.pdf", sep="")
|
||||
pdf(outfile, height=7, width=8)
|
||||
alpha = (data2$novelTITV - 0.5) / (targetTITV - 0.5);
|
||||
numGood = round(alpha * data2$numNovel);
|
||||
numBad = data2$numNovel - numGood;
|
||||
d=matrix(c(numGood,numBad),2,byrow=TRUE)
|
||||
barplot(d,horiz=TRUE,col=c(1,2),space=0.2,xlab="Number of Novel Variants",ylab="Novel Ti/Tv --> FDR (%)")
|
||||
legend('topright',c('implied TP','implied FP'),col=c(1,2),lty=1,lwd=16)
|
||||
axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche)
|
||||
axis(2,line=0.4,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$novelTITV)
|
||||
dev.off()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,195 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import Jama.Matrix;
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Jun 2, 2010
|
||||
*
|
||||
* @help.summary Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
|
||||
*/
|
||||
|
||||
public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=true)
|
||||
private String TRANCHE_FILENAME = "optimizer.dat.tranches";
|
||||
@Argument(fullName="outputVCFFile", shortName="outputVCF", doc="The output filtered VCF file", required=true)
|
||||
private String OUTPUT_FILENAME = "optimizer.vcf";
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private VCFWriter vcfWriter;
|
||||
private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();
|
||||
final ExpandingArrayList<Double> qCuts = new ExpandingArrayList<Double>();
|
||||
final ExpandingArrayList<String> filterName = new ExpandingArrayList<String>();
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
|
||||
boolean firstLine = true;
|
||||
try {
|
||||
for( final String line : new XReadLines(new File( TRANCHE_FILENAME )) ) {
|
||||
if( !firstLine ) {
|
||||
final String[] vals = line.split(",");
|
||||
qCuts.add(Double.parseDouble(vals[2]));
|
||||
filterName.add(vals[4]);
|
||||
}
|
||||
firstLine = false;
|
||||
}
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new StingException("Can not find input file: " + TRANCHE_FILENAME);
|
||||
}
|
||||
|
||||
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_KEY); // copied from VariantsToVCF
|
||||
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
|
||||
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.DEPTH_KEY);
|
||||
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY);
|
||||
|
||||
// setup the header fields
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
|
||||
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
|
||||
vcfWriter = new VCFWriter( new File(OUTPUT_FILENAME) );
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
||||
for( final ReferenceOrderedDataSource source : dataSources ) {
|
||||
final RMDTrack rod = source.getReferenceOrderedData();
|
||||
if( rod.getRecordType().equals(VCFRecord.class) ) {
|
||||
final VCFReader reader = new VCFReader(rod.getFile());
|
||||
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
|
||||
samples.addAll(vcfSamples);
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
// BUGBUG : why doesn't this work? VCFFilterHeaderLine is protected?
|
||||
//for( int iii = 1; iii < filterName.size(); iii++ ) {
|
||||
// hInfo.add(new VCFFilterHeaderLine(filterName.get(iii)), String.format("FDR tranche level at qual " + qCuts.get(iii)));
|
||||
//}
|
||||
//hInfo.add(new VCFFilterHeaderLine(filterName.get(0)+"+"), String.format("FDR tranche level at qual > " + qCuts.get(filterName.size()-1)));
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
|
||||
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
|
||||
final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false);
|
||||
if( !vc.isFiltered() ) {
|
||||
final double qual = vc.getPhredScaledQual();
|
||||
boolean setFilter = false;
|
||||
for( int tranche = qCuts.size() - 1; tranche >= 0; tranche-- ) {
|
||||
if( qual >= qCuts.get(tranche) ) {
|
||||
if(tranche == qCuts.size() - 1) {
|
||||
vcf.setFilterString(VCFRecord.PASSES_FILTERS);
|
||||
setFilter = true;
|
||||
} else {
|
||||
vcf.setFilterString(filterName.get(tranche));
|
||||
setFilter = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !setFilter ) {
|
||||
vcf.setFilterString(filterName.get(0)+"+");
|
||||
}
|
||||
}
|
||||
vcfWriter.addRecord( vcf );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Integer reduce( final Integer mapValue, final Integer reduceSum ) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
vcfWriter.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -102,6 +102,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
|
||||
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
|
||||
|
||||
if( IGNORE_INPUT_FILTERS != null ) {
|
||||
|
|
|
|||
|
|
@ -367,7 +367,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
|
||||
}
|
||||
|
||||
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
|
||||
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix,
|
||||
final int desiredNumVariants, final Double[] FDRtranches ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
final boolean[] markedVariant = new boolean[numVariants];
|
||||
|
|
@ -382,6 +383,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
final double novelTiTvAtCut[] = new double[NUM_BINS];
|
||||
final double theCut[] = new double[NUM_BINS];
|
||||
|
||||
final double fdrCutAsTiTv[] = new double[FDRtranches.length];
|
||||
for( int iii = 0; iii < FDRtranches.length; iii++ ) {
|
||||
fdrCutAsTiTv[iii] = (1.0 - FDRtranches[iii] / 100.0) * (targetTITV - 0.5) + 0.5;
|
||||
}
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
markedVariant[iii] = false;
|
||||
}
|
||||
|
|
@ -392,6 +398,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
} catch (Exception e) {
|
||||
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
|
||||
}
|
||||
PrintStream tranchesOutputFile;
|
||||
try {
|
||||
tranchesOutputFile = new PrintStream( outputPrefix + ".dat.tranches" );
|
||||
tranchesOutputFile.println("FDRtranche,novelTITV,pCut,numNovel,filterName");
|
||||
} catch (Exception e) {
|
||||
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat.tranches" );
|
||||
}
|
||||
|
||||
int numKnown = 0;
|
||||
int numNovel = 0;
|
||||
|
|
@ -447,7 +460,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
// loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio
|
||||
int checkQuantile = 0;
|
||||
int tranche = FDRtranches.length - 1;
|
||||
for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) {
|
||||
|
||||
if( tranche >= 0 && novelTiTvAtCut[jjj] >= fdrCutAsTiTv[tranche] ) {
|
||||
tranchesOutputFile.println(String.format("%.2f,%.2f,%.2f,%d,FDRtranche%.2fto%.2f",
|
||||
FDRtranches[tranche],novelTiTvAtCut[jjj],theCut[jjj],numNovelAtCut[jjj],
|
||||
(tranche == 0 ? 0.0 : FDRtranches[tranche-1]) ,FDRtranches[tranche]));
|
||||
tranche--;
|
||||
}
|
||||
|
||||
boolean foundCut = false;
|
||||
if( checkQuantile == 0 ) {
|
||||
if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) {
|
||||
|
|
@ -480,10 +502,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)");
|
||||
logger.info("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj]));
|
||||
logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj]));
|
||||
logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5))))));
|
||||
}
|
||||
}
|
||||
|
||||
outputFile.close();
|
||||
tranchesOutputFile.close();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -81,6 +81,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private String OUTPUT_PREFIX = "optimizer";
|
||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
||||
private Double[] FDR_TRANCHES = null;
|
||||
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
|
|
@ -156,6 +158,19 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
if(!foundDBSNP) {
|
||||
throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
|
||||
}
|
||||
|
||||
// Set up default values for the FDR tranches if necessary
|
||||
if( FDR_TRANCHES == null ) {
|
||||
FDR_TRANCHES = new Double[3];
|
||||
FDR_TRANCHES[0] = 12.5;
|
||||
FDR_TRANCHES[1] = 20.0;
|
||||
FDR_TRANCHES[2] = 30.0;
|
||||
//FDR_TRANCHES[0] = 1.0;
|
||||
//FDR_TRANCHES[1] = 5.0;
|
||||
//FDR_TRANCHES[2] = 10.0;
|
||||
//FDR_TRANCHES[3] = 20.0;
|
||||
//FDR_TRANCHES[4] = 30.0;
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -226,7 +241,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
||||
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS );
|
||||
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS, FDR_TRANCHES );
|
||||
|
||||
// Execute Rscript command to plot the optimization curve
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
|
|
|
|||
Loading…
Reference in New Issue