Support for automated selects for tranches in variant eval -- use -tf to make tranch-specific ve outputs. ApplyVariantCuts with tranche reading functions for general use, along with todo for ryan. CombineVariants now has --filteredAreUncalled and will treat filtered snps in input VCFs are uncalled, and so won't emit -filteredInOther set features

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3908 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-30 14:16:43 +00:00 · 2010-07-30 14:16:43 +00:00 · ac8048f17b
parent 9231d13252
commit ac8048f17b
6 changed files with 94 additions and 14 deletions
--- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java
+++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java
@ -77,6 +77,13 @@ public class VariantContextUtils {
        return VariantContextUtils.initializeMatchExps(map);
    }

+    public static List<JexlVCMatchExp> initializeMatchExps(ArrayList<String> names, ArrayList<String> exps) {
+        String[] nameArray = new String[names.size()];
+        String[] expArray = new String[exps.size()];
+        return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray));
+    }
+
+
    /**
     * Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps.  These two arrays contain
     * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
@ -191,12 +198,13 @@ public class VariantContextUtils {
                                             VariantMergeType variantMergeOptions, GenotypeMergeType genotypeMergeOptions,
                                             boolean annotateOrigin, boolean printMessages, byte inputRefBase ) {

-        return simpleMerge(unsortedVCs, priorityListOfVCs, variantMergeOptions, genotypeMergeOptions, annotateOrigin, printMessages, inputRefBase, "set");
+        return simpleMerge(unsortedVCs, priorityListOfVCs, variantMergeOptions, genotypeMergeOptions, annotateOrigin, printMessages, inputRefBase, "set", false);
    }

-public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs,
-                                         VariantMergeType variantMergeOptions, GenotypeMergeType genotypeMergeOptions,
-                                         boolean annotateOrigin, boolean printMessages, byte inputRefBase, String setKey ) {
+    public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs,
+                                             VariantMergeType variantMergeOptions, GenotypeMergeType genotypeMergeOptions,
+                                             boolean annotateOrigin, boolean printMessages, byte inputRefBase, String setKey,
+                                             boolean filteredAreUncalled ) {
        if ( unsortedVCs == null || unsortedVCs.size() == 0 )
            return null;

@ -211,9 +219,13 @@ public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs,
        List<VariantContext> VCs = new ArrayList<VariantContext>();

        for (VariantContext vc : prepaddedVCs) {
-            VCs.add(createVariantContextWithPaddedAlleles(vc,inputRefBase));
+            // also a reasonable place to remove filtered calls, if needed
+            if ( ! filteredAreUncalled || vc.isNotFiltered() )
+                VCs.add(createVariantContextWithPaddedAlleles(vc,inputRefBase));
        }

+        if ( VCs.size() == 0 ) // everything is filtered out and we're filteredareUncalled
+            return null;

        // establish the baseline info from the first VC
        VariantContext first = VCs.get(0);
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MyHaplotypeScore.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MyHaplotypeScore.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010 The Broad Institute
+ * Copyright (c) 2010, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
@ -12,15 +12,14 @@
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
 */

 package org.broadinstitute.sting.gatk.walkers.annotator;
--- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java
@ -38,7 +38,7 @@ import java.util.*;

@Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks")
 public class GenotypeConcordance extends VariantEvaluator {
-    private static final boolean  PRINT_INTERESTING_SITES = false;
+    private static final boolean PRINT_INTERESTING_SITES = true;

    protected final static Logger logger = Logger.getLogger(GenotypeConcordance.class);

@ -306,7 +306,8 @@ public class GenotypeConcordance extends VariantEvaluator {
                final Genotype.Type truth = validation.getGenotype(sample).getType();
                sampleStats.incrValue(sample, truth, called);
                if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) {
-                    if ( PRINT_INTERESTING_SITES ) System.out.printf("%s: HM3 FN => %s%n", group, validation);
+                    if ( PRINT_INTERESTING_SITES && super.getVEWalker().printInterestingSites() )
+                        System.out.printf("%s: HM3 FN => %s%n", group, validation);
                }
            }
        }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@ -38,6 +38,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
 import org.broadinstitute.sting.gatk.walkers.Reference;
 import org.broadinstitute.sting.gatk.walkers.RodWalker;
 import org.broadinstitute.sting.gatk.walkers.Window;
+import org.broadinstitute.sting.gatk.walkers.variantrecalibration.ApplyVariantCuts;
 import org.broadinstitute.sting.playground.utils.report.ReportMarshaller;
 import org.broadinstitute.sting.playground.utils.report.VE2ReportFactory;
 import org.broadinstitute.sting.playground.utils.report.templates.ReportFormat;
@ -114,7 +115,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
    // --------------------------------------------------------------------------------------------------------------

    @Argument(shortName="select", doc="One or more stratifications to use when evaluating the data", required=false)
-    protected String[] SELECT_EXPS = {};
+    protected ArrayList<String> SELECT_EXPS = new ArrayList<String>();
    //protected String[] SELECT_EXPS = {"set == \"Intersection\"",
    //        "set == \"HiSeq.WGS.cleaned.ug.vcf\"",
    //        "set == \"HiSeq.WGS.cleaned.ug.vcf\" || set == \"Intersection\"",
@ -122,7 +123,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
    //        "set == \"HiSeq.WGS.raw.OQ.ug.vcf\" || set == \"Intersection\""};
    
    @Argument(shortName="selectName", doc="Names to use for the list of stratifications (must be a 1-to-1 mapping)", required=false)
-    protected String[] SELECT_NAMES = {};
+    protected ArrayList<String> SELECT_NAMES = new ArrayList<String>();
    //protected String[] SELECT_NAMES = {"Intersection", "x1", "x2", "x3", "x4"};

    @Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
@ -194,6 +195,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
    @Argument(shortName="aatUseCodons", fullName="aminoAcidsRepresentedByCodons", doc="for the amino acid table, specifiy that the transitions are represented as codon changes, and not directly amino acid names", required = false)
    protected boolean aatUseCodons = false;

+    @Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false)
+    private String TRANCHE_FILENAME = null;
+
+
    // --------------------------------------------------------------------------------------------------------------
    //
    // private walker data
@ -279,6 +284,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
    //
    // --------------------------------------------------------------------------------------------------------------

+    public boolean printInterestingSites() { return writer != null; }
+
    public void initialize() {
        if ( dels ) {
            ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.NO_VARIATION);
@ -293,6 +300,17 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {

        determineEvalations();

+        if ( TRANCHE_FILENAME != null ) {
+            // we are going to build a few select names automatically from the tranches file
+            for ( ApplyVariantCuts.Tranche t : ApplyVariantCuts.readTraches(new File(TRANCHE_FILENAME)) ) {
+                logger.info("Adding select for all variant above the pCut of : " + t);
+                SELECT_EXPS.add(String.format("QUAL >= %.2f", t.pCut));
+                SELECT_NAMES.add(String.format("FDR-%.2f", t.fdr));
+            }
+        }
+
+        logger.info("Selects: " + SELECT_NAMES);
+        logger.info("Selects: " + SELECT_EXPS);
        List<VariantContextUtils.JexlVCMatchExp> selectExps = VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS);

        for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
--- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java
@ -72,14 +72,61 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
    final ExpandingArrayList<Double> qCuts = new ExpandingArrayList<Double>();
    final ExpandingArrayList<String> filterName = new ExpandingArrayList<String>();

+    public static class Tranche {
+        public double fdr, pCut, novelTiTv;
+        public int numNovel;
+        public String name;
+
+        public Tranche(double fdr, double pCut, double novelTiTv, int numNovel, String name) {
+            this.fdr = fdr;
+            this.pCut = pCut;
+            this.novelTiTv = novelTiTv;
+            this.numNovel = numNovel;
+            this.name = name;
+        }
+
+        public Tranche(final String line) {
+            final String[] vals = line.split(",");
+            this.fdr = Double.parseDouble(vals[0]);
+            this.novelTiTv = Double.parseDouble(vals[1]);
+            this.pCut = Double.parseDouble(vals[2]);
+            this.numNovel = Integer.parseInt(vals[3]);
+            this.name = vals[4];
+        }
+
+        public String toString() {
+            return String.format("[Tranche %s cut = %.3f with %d novels @ %.2f]", name, pCut, numNovel, novelTiTv);
+        }
+    }
+
    //---------------------------------------------------------------------------------------------------------------
    //
    // initialize
    //
    //---------------------------------------------------------------------------------------------------------------

+    public static List<Tranche> readTraches(File f) {
+        boolean firstLine = true;
+        List<Tranche> tranches = new ArrayList<Tranche>();
+
+        try {
+            for( final String line : new XReadLines(f) ) {
+                if( ! firstLine ) {
+                    tranches.add(new Tranche(line));
+                }
+                firstLine = false;
+            }
+
+            return tranches;
+        } catch( FileNotFoundException e ) {
+            throw new StingException("Can not find input file: " + f);
+        }
+    }
+
    public void initialize() {

+        // todo -- ryan, it's always best to use a read data structure, I need to read these in.
+        // todo -- I would have updated your code but there's no integration test to protect me from unexpected effects
        boolean firstLine = true;
        try {
            for( final String line : new XReadLines(new File( TRANCHE_FILENAME )) ) {
--- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
@ -65,6 +65,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
    @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false)
    public boolean printComplexMerges = false;

+    @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotation don't appear in the combined VCF", required=false)
+    public boolean filteredAreUncalled = false;
+
    @Argument(fullName="setKey", shortName="setKey", doc="Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from.  Set to null if you don't want the set field emitted.", required=false)
    public String SET_KEY = "set";

@ -120,7 +123,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
        // Need to provide reference bases to simpleMerge starting at current locus
        Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
        VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, variantMergeOption,
-                genotypeMergeOption, true, printComplexMerges, ref.getBase(), SET_KEY);
+                genotypeMergeOption, true, printComplexMerges, ref.getBase(), SET_KEY, filteredAreUncalled);


        //out.printf("   merged => %s%nannotated => %s%n", mergedVC, annotatedMergedVC);