diff --git a/R/plot_OptimizationCurve.R b/R/plot_OptimizationCurve.R index c6bc68e67..c2410075a 100755 --- a/R/plot_OptimizationCurve.R +++ b/R/plot_OptimizationCurve.R @@ -32,4 +32,18 @@ points(data$pCut, data$numNovel,col="DarkGreen",pch=20) axis(side=4,col="DarkGreen") mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4) legend("topright", c("Known","Novel"), col=c("Green","DarkGreen"), pch=c(20,20),cex=0.7) -dev.off() \ No newline at end of file +dev.off() + + +data2 = read.table(paste(input,".tranches",sep=""),sep=",",head=T) +outfile = paste(input, ".FDRtranches.pdf", sep="") +pdf(outfile, height=7, width=8) +alpha = (data2$novelTITV - 0.5) / (targetTITV - 0.5); +numGood = round(alpha * data2$numNovel); +numBad = data2$numNovel - numGood; +d=matrix(c(numGood,numBad),2,byrow=TRUE) +barplot(d,horiz=TRUE,col=c(1,2),space=0.2,xlab="Number of Novel Variants",ylab="Novel Ti/Tv --> FDR (%)") +legend('topright',c('implied TP','implied FP'),col=c(1,2),lty=1,lwd=16) +axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche) +axis(2,line=0.4,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$novelTITV) +dev.off() diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java new file mode 100755 index 000000000..e1b235dd8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import Jama.Matrix; +import org.broad.tribble.dbsnp.DbSNPFeature; +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.ExpandingArrayList; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; +import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; + +/** + * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration + * + * @author rpoplin + * @since Jun 2, 2010 + * + * @help.summary Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration + */ + +public class ApplyVariantCuts extends RodWalker { + + ///////////////////////////// + // Command Line Arguments + ///////////////////////////// + @Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=true) + private String TRANCHE_FILENAME = "optimizer.dat.tranches"; + @Argument(fullName="outputVCFFile", shortName="outputVCF", doc="The output filtered VCF file", required=true) + private String OUTPUT_FILENAME = "optimizer.vcf"; + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private VCFWriter vcfWriter; + private final ArrayList ALLOWED_FORMAT_FIELDS = new ArrayList(); + final ExpandingArrayList qCuts = new ExpandingArrayList(); + final ExpandingArrayList filterName = new ExpandingArrayList(); + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + public void initialize() { + + boolean firstLine = true; + try { + for( final String line : new XReadLines(new File( TRANCHE_FILENAME )) ) { + if( !firstLine ) { + final String[] vals = line.split(","); + qCuts.add(Double.parseDouble(vals[2])); + filterName.add(vals[4]); + } + firstLine = false; + } + } catch( FileNotFoundException e ) { + throw new StingException("Can not find input file: " + TRANCHE_FILENAME); + } + + ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_KEY); // copied from VariantsToVCF + ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY); + ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.DEPTH_KEY); + ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY); + + // setup the header fields + final Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score")); + hInfo.add(new VCFHeaderLine("source", "VariantOptimizer")); + vcfWriter = new VCFWriter( new File(OUTPUT_FILENAME) ); + final TreeSet samples = new TreeSet(); + final List dataSources = this.getToolkit().getRodDataSources(); + for( final ReferenceOrderedDataSource source : dataSources ) { + final RMDTrack rod = source.getReferenceOrderedData(); + if( rod.getRecordType().equals(VCFRecord.class) ) { + final VCFReader reader = new VCFReader(rod.getFile()); + final Set vcfSamples = reader.getHeader().getGenotypeSamples(); + samples.addAll(vcfSamples); + reader.close(); + } + } + + // BUGBUG : why doesn't this work? VCFFilterHeaderLine is protected? + //for( int iii = 1; iii < filterName.size(); iii++ ) { + // hInfo.add(new VCFFilterHeaderLine(filterName.get(iii)), String.format("FDR tranche level at qual " + qCuts.get(iii))); + //} + //hInfo.add(new VCFFilterHeaderLine(filterName.get(0)+"+"), String.format("FDR tranche level at qual > " + qCuts.get(filterName.size()-1))); + + final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter.writeHeader(vcfHeader); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + + if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers + return 1; + } + + for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) { + if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) { + final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false); + if( !vc.isFiltered() ) { + final double qual = vc.getPhredScaledQual(); + boolean setFilter = false; + for( int tranche = qCuts.size() - 1; tranche >= 0; tranche-- ) { + if( qual >= qCuts.get(tranche) ) { + if(tranche == qCuts.size() - 1) { + vcf.setFilterString(VCFRecord.PASSES_FILTERS); + setFilter = true; + } else { + vcf.setFilterString(filterName.get(tranche)); + setFilter = true; + } + break; + } + } + if( !setFilter ) { + vcf.setFilterString(filterName.get(0)+"+"); + } + } + vcfWriter.addRecord( vcf ); + } + + } + + return 1; // This value isn't used for anything + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + public Integer reduceInit() { + return 1; + } + + public Integer reduce( final Integer mapValue, final Integer reduceSum ) { + return 1; + } + + public void onTraversalDone( ExpandingArrayList reduceSum ) { + vcfWriter.close(); + } +} + diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java index 4b1af125f..60e715cda 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java @@ -102,6 +102,8 @@ public class GenerateVariantClustersWalker extends RodWalker(Arrays.asList(USE_ANNOTATIONS)); if( IGNORE_INPUT_FILTERS != null ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java index c5a415445..573bbc094 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java @@ -367,7 +367,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel // BUGBUG: next output the actual cluster on top by integrating out every other annotation } - public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) { + public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, + final int desiredNumVariants, final Double[] FDRtranches ) { final int numVariants = data.length; final boolean[] markedVariant = new boolean[numVariants]; @@ -382,6 +383,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel final double novelTiTvAtCut[] = new double[NUM_BINS]; final double theCut[] = new double[NUM_BINS]; + final double fdrCutAsTiTv[] = new double[FDRtranches.length]; + for( int iii = 0; iii < FDRtranches.length; iii++ ) { + fdrCutAsTiTv[iii] = (1.0 - FDRtranches[iii] / 100.0) * (targetTITV - 0.5) + 0.5; + } + for( int iii = 0; iii < numVariants; iii++ ) { markedVariant[iii] = false; } @@ -392,6 +398,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } catch (Exception e) { throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" ); } + PrintStream tranchesOutputFile; + try { + tranchesOutputFile = new PrintStream( outputPrefix + ".dat.tranches" ); + tranchesOutputFile.println("FDRtranche,novelTITV,pCut,numNovel,filterName"); + } catch (Exception e) { + throw new StingException( "Unable to create output file: " + outputPrefix + ".dat.tranches" ); + } int numKnown = 0; int numNovel = 0; @@ -447,7 +460,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel // loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio int checkQuantile = 0; + int tranche = FDRtranches.length - 1; for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) { + + if( tranche >= 0 && novelTiTvAtCut[jjj] >= fdrCutAsTiTv[tranche] ) { + tranchesOutputFile.println(String.format("%.2f,%.2f,%.2f,%d,FDRtranche%.2fto%.2f", + FDRtranches[tranche],novelTiTvAtCut[jjj],theCut[jjj],numNovelAtCut[jjj], + (tranche == 0 ? 0.0 : FDRtranches[tranche-1]) ,FDRtranches[tranche])); + tranche--; + } + boolean foundCut = false; if( checkQuantile == 0 ) { if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) { @@ -480,10 +502,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)"); logger.info("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj])); logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj])); + logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5)))))); } } outputFile.close(); + tranchesOutputFile.close(); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index f1060c232..998e25968 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -81,6 +81,8 @@ public class VariantRecalibrator extends RodWalker