Automatic cutting of recalibrated variant calls using ApplyVariantCuts. VariantRecalibrator produces the tranches plot alongside the optimization curve. Specify the tranche levels using -tranche 1.0 -tranche 5.0, etc.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3472 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-06-02 15:03:00 +00:00
parent 4a555827aa
commit 290771a8c2
5 changed files with 253 additions and 3 deletions

View File

@ -32,4 +32,18 @@ points(data$pCut, data$numNovel,col="DarkGreen",pch=20)
axis(side=4,col="DarkGreen")
mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4)
legend("topright", c("Known","Novel"), col=c("Green","DarkGreen"), pch=c(20,20),cex=0.7)
dev.off()
# NOTE(review): dev.off() appears twice in a row — verify two devices are
# actually open at this point, otherwise the second call will error.
dev.off()
# Read the comma-separated tranches table written by VariantRecalibrator
# (header: FDRtranche,novelTITV,pCut,numNovel,filterName).
data2 = read.table(paste(input,".tranches",sep=""),sep=",",head=T)
# Render the FDR-tranche bar chart into <input>.FDRtranches.pdf.
outfile = paste(input, ".FDRtranches.pdf", sep="")
pdf(outfile, height=7, width=8)
# Fraction of novel calls implied to be true positives at each tranche,
# derived from the novel Ti/Tv relative to the target (0.5 = random).
alpha = (data2$novelTITV - 0.5) / (targetTITV - 0.5);
numGood = round(alpha * data2$numNovel);
numBad = data2$numNovel - numGood;
# Stack implied-TP (row 1) and implied-FP (row 2) counts per tranche.
d=matrix(c(numGood,numBad),2,byrow=TRUE)
barplot(d,horiz=TRUE,col=c(1,2),space=0.2,xlab="Number of Novel Variants",ylab="Novel Ti/Tv --> FDR (%)")
legend('topright',c('implied TP','implied FP'),col=c(1,2),lty=1,lwd=16)
# Label each bar twice on the left axis: FDR tranche level (outer) and the
# novel Ti/Tv achieved at that cut (inner); offsets align with barplot spacing.
axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche)
axis(2,line=0.4,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$novelTITV)
dev.off()

View File

@ -0,0 +1,195 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import Jama.Matrix;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
/**
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
*
* @author rpoplin
* @since Jun 2, 2010
*
* @help.summary Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
*/
public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=true)
private String TRANCHE_FILENAME = "optimizer.dat.tranches";
@Argument(fullName="outputVCFFile", shortName="outputVCF", doc="The output filtered VCF file", required=true)
private String OUTPUT_FILENAME = "optimizer.vcf";
/////////////////////////////
// Private Member Variables
/////////////////////////////
private VCFWriter vcfWriter;
private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();
final ExpandingArrayList<Double> qCuts = new ExpandingArrayList<Double>();
final ExpandingArrayList<String> filterName = new ExpandingArrayList<String>();
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
boolean firstLine = true;
try {
for( final String line : new XReadLines(new File( TRANCHE_FILENAME )) ) {
if( !firstLine ) {
final String[] vals = line.split(",");
qCuts.add(Double.parseDouble(vals[2]));
filterName.add(vals[4]);
}
firstLine = false;
}
} catch( FileNotFoundException e ) {
throw new StingException("Can not find input file: " + TRANCHE_FILENAME);
}
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_KEY); // copied from VariantsToVCF
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.DEPTH_KEY);
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY);
// setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
vcfWriter = new VCFWriter( new File(OUTPUT_FILENAME) );
final TreeSet<String> samples = new TreeSet<String>();
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if( rod.getRecordType().equals(VCFRecord.class) ) {
final VCFReader reader = new VCFReader(rod.getFile());
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
samples.addAll(vcfSamples);
reader.close();
}
}
// BUGBUG : why doesn't this work? VCFFilterHeaderLine is protected?
//for( int iii = 1; iii < filterName.size(); iii++ ) {
// hInfo.add(new VCFFilterHeaderLine(filterName.get(iii)), String.format("FDR tranche level at qual " + qCuts.get(iii)));
//}
//hInfo.add(new VCFFilterHeaderLine(filterName.get(0)+"+"), String.format("FDR tranche level at qual > " + qCuts.get(filterName.size()-1)));
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
}
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
return 1;
}
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false);
if( !vc.isFiltered() ) {
final double qual = vc.getPhredScaledQual();
boolean setFilter = false;
for( int tranche = qCuts.size() - 1; tranche >= 0; tranche-- ) {
if( qual >= qCuts.get(tranche) ) {
if(tranche == qCuts.size() - 1) {
vcf.setFilterString(VCFRecord.PASSES_FILTERS);
setFilter = true;
} else {
vcf.setFilterString(filterName.get(tranche));
setFilter = true;
}
break;
}
}
if( !setFilter ) {
vcf.setFilterString(filterName.get(0)+"+");
}
}
vcfWriter.addRecord( vcf );
}
}
return 1; // This value isn't used for anything
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
public Integer reduceInit() {
return 1;
}
public Integer reduce( final Integer mapValue, final Integer reduceSum ) {
return 1;
}
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
vcfWriter.close();
}
}

View File

@ -102,6 +102,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
if( IGNORE_INPUT_FILTERS != null ) {

View File

@ -367,7 +367,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
}
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix,
final int desiredNumVariants, final Double[] FDRtranches ) {
final int numVariants = data.length;
final boolean[] markedVariant = new boolean[numVariants];
@ -382,6 +383,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final double novelTiTvAtCut[] = new double[NUM_BINS];
final double theCut[] = new double[NUM_BINS];
final double fdrCutAsTiTv[] = new double[FDRtranches.length];
for( int iii = 0; iii < FDRtranches.length; iii++ ) {
fdrCutAsTiTv[iii] = (1.0 - FDRtranches[iii] / 100.0) * (targetTITV - 0.5) + 0.5;
}
for( int iii = 0; iii < numVariants; iii++ ) {
markedVariant[iii] = false;
}
@ -392,6 +398,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
}
PrintStream tranchesOutputFile;
try {
tranchesOutputFile = new PrintStream( outputPrefix + ".dat.tranches" );
tranchesOutputFile.println("FDRtranche,novelTITV,pCut,numNovel,filterName");
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat.tranches" );
}
int numKnown = 0;
int numNovel = 0;
@ -447,7 +460,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio
int checkQuantile = 0;
int tranche = FDRtranches.length - 1;
for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) {
if( tranche >= 0 && novelTiTvAtCut[jjj] >= fdrCutAsTiTv[tranche] ) {
tranchesOutputFile.println(String.format("%.2f,%.2f,%.2f,%d,FDRtranche%.2fto%.2f",
FDRtranches[tranche],novelTiTvAtCut[jjj],theCut[jjj],numNovelAtCut[jjj],
(tranche == 0 ? 0.0 : FDRtranches[tranche-1]) ,FDRtranches[tranche]));
tranche--;
}
boolean foundCut = false;
if( checkQuantile == 0 ) {
if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) {
@ -480,10 +502,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)");
logger.info("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj]));
logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj]));
logger.info("\t" + String.format("--> with an implied novel FDR of %.2f percent", Math.abs(100.0 * (1.0-((novelTiTvAtCut[jjj] - 0.5) / (targetTITV - 0.5))))));
}
}
outputFile.close();
tranchesOutputFile.close();
}

View File

@ -81,6 +81,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private Double[] FDR_TRANCHES = null;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
@ -156,6 +158,19 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
if(!foundDBSNP) {
throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
}
// Set up default values for the FDR tranches if necessary
if( FDR_TRANCHES == null ) {
FDR_TRANCHES = new Double[3];
FDR_TRANCHES[0] = 12.5;
FDR_TRANCHES[1] = 20.0;
FDR_TRANCHES[2] = 30.0;
//FDR_TRANCHES[0] = 1.0;
//FDR_TRANCHES[1] = 5.0;
//FDR_TRANCHES[2] = 10.0;
//FDR_TRANCHES[3] = 20.0;
//FDR_TRANCHES[4] = 30.0;
}
}
//---------------------------------------------------------------------------------------------------------------
@ -226,7 +241,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS );
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS, FDR_TRANCHES );
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it