Add a VariantEval integration test for tranches files, and cut VariantEval over to the generic Tranche framework

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4656 348d0f76-0448-11de-a6fe-93d51630548a
2010-11-12 13:52:40 +00:00 · 2010-11-12 13:52:40 +00:00 · c5f8c4dd0d
parent 69de3e51bf
commit c5f8c4dd0d
5 changed files with 15 additions and 20 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@ -42,7 +42,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference;
 import org.broadinstitute.sting.gatk.walkers.RodWalker;
 import org.broadinstitute.sting.gatk.walkers.Window;
 import org.broadinstitute.sting.gatk.walkers.TreeReducible;
-import org.broadinstitute.sting.gatk.walkers.variantrecalibration.ApplyVariantCuts;
+import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche;
 import org.broadinstitute.sting.utils.SampleUtils;
 import org.broadinstitute.sting.utils.report.ReportMarshaller;
 import org.broadinstitute.sting.utils.report.VE2ReportFactory;
@ -71,18 +71,10 @@ import java.util.*;
 // todo -- clustered SNP counter
 // todo -- HWEs
 // todo -- indel metrics [count of sizes in/del should be in CountVariants]
-// todo -- synonymous / non-synonmous ratio, or really just comparison of observed vs. expected biological annotation values
-
-// todo -- Performance:
-// todo -- deal with performance issues with variant contexts

 // todo -- port over SNP density walker:
 // todo -- see walker for WG calc but will need to make it work with intervals correctly

-// todo -- counts of snps per target [target name, gene, etc]
-
-// todo -- add subgroup of known variants as to those at hapmap sites [it's in the dbSNP record]
-
 // Todo -- should really include argument parsing @annotations from subclass in this walker.  Very
 // todo -- useful general capability.  Right now you need to add arguments to VariantEval2 to handle new
 // todo -- evaluation arguments (which is better than passing a string!)
@ -101,8 +93,6 @@ import java.util.*;
 // todo -- discovered and released by 1KG.  Might need to make this data set ourselves and keep it in GATK/data like
 // todo -- dbsnp rod
 //
-// todo -- aux. plotting routines for VE2
-//
 // todo -- implement as select statment, but it's hard for multi-sample calls.
 // todo -- Provide separate dbsnp rates for het only calls and any call where there is at least one hom-var genotype,
 // todo -- since hets are much more likely to be errors
@ -126,15 +116,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr

    @Argument(shortName="select", doc="One or more stratifications to use when evaluating the data", required=false)
    protected ArrayList<String> SELECT_EXPS = new ArrayList<String>();
-    //protected String[] SELECT_EXPS = {"set == \"Intersection\"",
-    //        "set == \"HiSeq.WGS.cleaned.ug.vcf\"",
-    //        "set == \"HiSeq.WGS.cleaned.ug.vcf\" || set == \"Intersection\"",
-    //        "set == \"HiSeq.WGS.raw.OQ.ug.vcf\"",
-    //        "set == \"HiSeq.WGS.raw.OQ.ug.vcf\" || set == \"Intersection\""};

    @Argument(shortName="selectName", doc="Names to use for the list of stratifications (must be a 1-to-1 mapping)", required=false)
    protected ArrayList<String> SELECT_NAMES = new ArrayList<String>();
-    //protected String[] SELECT_NAMES = {"Intersection", "x1", "x2", "x3", "x4"};

    @Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
    protected String[] KNOWN_NAMES = {DbSNPHelper.STANDARD_DBSNP_TRACK_NAME};
@ -321,15 +305,17 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr

        if ( TRANCHE_FILENAME != null ) {
            // we are going to build a few select names automatically from the tranches file
-            for ( ApplyVariantCuts.Tranche t : ApplyVariantCuts.readTraches(new File(TRANCHE_FILENAME)) ) {
+            for ( Tranche t : Tranche.readTraches(new File(TRANCHE_FILENAME)) ) {
                logger.info("Adding select for all variant above the pCut of : " + t);
                SELECT_EXPS.add(String.format("QUAL >= %.2f", t.pCut));
                SELECT_NAMES.add(String.format("FDR-%.2f", t.fdr));
            }
        }

-        logger.info("Selects: " + SELECT_NAMES);
-        logger.info("Selects: " + SELECT_EXPS);
+        if ( SELECT_NAMES.size() > 0 ) {
+            logger.info("Selects: " + SELECT_NAMES);
+            logger.info("Selects: " + SELECT_EXPS);
+        }
        List<VariantContextUtils.JexlVCMatchExp> selectExps = VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS);

        for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
--- a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@ -141,4 +141,13 @@ public class
    private static String withSelect(String cmd, String select, String name) {
        return String.format("%s -select '%s' -selectName %s", cmd, select, name);
    }
+
+    @Test
+    public void testTranches() {
+        String extraArgs = "-T VariantEval -R "+ hg18Reference +" -B:eval,vcf " + validationDataLocation + "GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.vcf -o %s -E TiTvVariantEvaluator -L chr1 -noStandard -reportType CSV -tf " + testDir + "tranches.4.txt";
+        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("85b6621c64cc8f9a3b68cea644edf216"));
+        executeTestParallel("testTranches",spec);
+        //executeTest("testACDiscordanceAtAC1EvalAC2Comp",spec);
+    }
+
 }
--- a/java/test/data/tranches.4.txt
+++ b/java/test/data/tranches.4.txt
--- a/java/test/data/tranches.6.txt
+++ b/java/test/data/tranches.6.txt
--- a/java/test/data/tranches.raw.dat
+++ b/java/test/data/tranches.raw.dat