Improvements to indel analysis capabilities of VariantEval

-- Now calculates the number of Indels overlapping gold standard sites, as well as the percent of indels overlapping gold standard sites -- Removed insertion : deletion ratio for 1 bp event, replaced it with 1 + 2 : 3 bp ratio for insertions and deletions separately. This is based on an old email from Mark Daly: // - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a // downstream frameshift, if we make the simplifying assumptions that 3 bp ins // and 3bp del (adding/subtracting 1 AA in general) are roughly comparably // selected against, we should see a consistent 1+2 : 3 bp ratio for insertions // as for deletions, and certainly would expect consistency between in/dels that // multiple methods find and in/dels that are unique to one method (since deletions // are more common and the artifacts differ, it is probably worth looking at the totals, // overlaps and ratios for insertions and deletions separately in the methods // comparison and in this case don't even need to make the simplifying in = del functional assumption -- Added a new VEW argument to bind a gold standard track -- Added two new stratifications: OneBPIndel and TandemRepeat which do exactly what you imagine they do -- Deleted random unused functions in IndelUtils
2012-04-06 16:02:09 -04:00 · 2012-04-06 16:02:09 -04:00 · 45fc0ea98d
parent 52ef4a3e26
commit 45fc0ea98d
6 changed files with 194 additions and 64 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@ -116,6 +116,15 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
    @ArgumentCollection
    protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

+    /**
+     * Some analyses want to count overlap not with dbSNP (which is in general very open) but
+     * actually want to itemize their overlap specifically with a set of gold standard sites
+     * such as HapMap, OMNI, or the gold standard indels.  Theis argument provides a mechanism
+     * for communicating which file to use
+     */
+    @Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false)
+    public RodBinding<VariantContext> goldStandard = null;
+
    // Help arguments
    @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false)
    protected Boolean LIST = false;
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java
@ -56,6 +56,12 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
    @DataPoint(description = "Number of singleton Indels", format = "%d")
    public int n_singleton_indels = 0;

+    @DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d")
+    public int n_indels_matching_gold_standard = 0;
+
+    @DataPoint(description = "Percent of indels overlapping gold standard sites")
+    public String gold_standard_matching_rate;
+
    // counts 1 for each site where the number of alleles > 2
    public int nMultiIndelSites = 0;

@ -71,18 +77,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
    @DataPoint(description = "Indel novelty rate")
    public String indel_novelty_rate;

-    @DataPoint(description = "1 to 2 bp indel ratio")
-    public String ratio_of_1_to_2_bp_indels;
-
-    @DataPoint(description = "1 to 3 bp indel ratio")
-    public String ratio_of_1_to_3_bp_indels;
-
-    @DataPoint(description = "2 to 3 bp indel ratio")
-    public String ratio_of_2_to_3_bp_indels;
-
-    @DataPoint(description = "1 and 2 to 3 bp indel ratio")
-    public String ratio_of_1_and_2_to_3_bp_indels;
-
    @DataPoint(description = "Frameshift percent")
    public String frameshift_rate_for_coding_indels;

@ -92,9 +86,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
    @DataPoint(description = "Insertion to deletion ratio")
    public String insertion_to_deletion_ratio;

-    @DataPoint(description = "Insertion to deletion ratio for 1 bp events")
-    public String insertion_to_deletion_ratio_for_1bp_indels;
-
    //
    // Frameshifts
    //
@ -116,9 +107,25 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
    int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0;

    int nKnownIndels = 0, nInsertions = 0;
-    int n1bpInsertions = 0, n1bpDeletions = 0;
-    int[] countByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used

+    int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
+    int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
+
+    // - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a
+    // downstream frameshift, if we make the simplifying assumptions that 3 bp ins
+    // and 3bp del (adding/subtracting 1 AA in general) are roughly comparably
+    // selected against, we should see a consistent 1+2 : 3 bp ratio for insertions
+    // as for deletions, and certainly would expect consistency between in/dels that
+    // multiple methods find and in/dels that are unique to one method  (since deletions
+    // are more common and the artifacts differ, it is probably worth looking at the totals,
+    // overlaps and ratios for insertions and deletions separately in the methods
+    // comparison and in this case don't even need to make the simplifying in = del functional assumption
+
+    @DataPoint(description = "ratio of 1 and 2 bp insertions to 3 bp insertions")
+    public String ratio_of_1_and_2_to_3_bp_insertions;
+
+    @DataPoint(description = "ratio of 1 and 2 bp deletions to 3 bp deletions")
+    public String ratio_of_1_and_2_to_3_bp_deletions;

    public final static int LARGE_INDEL_SIZE_THRESHOLD = 10;

@ -150,11 +157,11 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
                }
                break;
            case INDEL:
+                final VariantContext gold = getWalker().goldStandard == null ? null : tracker.getFirstValue(getWalker().goldStandard);
                if ( eval.isComplexIndel() ) break; // don't count complex substitutions
                
                nIndelSites++;
                if ( ! eval.isBiallelic() ) nMultiIndelSites++;
-                if ( variantWasSingleton(eval) ) n_singleton_indels++;

                // collect information about het / hom ratio
                for ( final Genotype g : eval.getGenotypes() ) {
@ -164,15 +171,14 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {

                for ( Allele alt : eval.getAlternateAlleles() ) {
                    n_indels++; // +1 for each alt allele
-
+                    if ( variantWasSingleton(eval) ) n_singleton_indels++;
                    if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific?
+                    if ( gold != null ) n_indels_matching_gold_standard++;

                    // ins : del ratios
                    final int alleleSize = alt.length() - eval.getReference().length();
                    if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference());
                    if ( alleleSize > 0 ) nInsertions++;
-                    if ( alleleSize == 1 ) n1bpInsertions++;
-                    if ( alleleSize == -1 ) n1bpDeletions++;

                    // requires snpEFF annotations
                    if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) {
@ -193,6 +199,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
                        n_large_deletions++;
                    
                    // update the baby histogram
+                    final int[] countByLength = alleleSize < 0 ? deletionCountByLength : insertionCountByLength;
                    final int absSize = Math.abs(alleleSize);
                    if ( absSize < countByLength.length ) countByLength[absSize]++;

@ -210,18 +217,18 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
        percent_of_sites_with_more_than_2_alleles = Utils.formattedRatio(nMultiIndelSites, nIndelSites);
        SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels);
        SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels);
+
+        gold_standard_matching_rate = Utils.formattedNoveltyRate(n_indels_matching_gold_standard, n_indels);
        indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels);
-        ratio_of_1_to_2_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[2]);
-        ratio_of_1_to_3_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[3]);
-        ratio_of_2_to_3_bp_indels = Utils.formattedRatio(countByLength[2], countByLength[3]);
-        ratio_of_1_and_2_to_3_bp_indels = Utils.formattedRatio(countByLength[1] + countByLength[2], countByLength[3]);
        frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting);

+        ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]);
+        ratio_of_1_and_2_to_3_bp_insertions = Utils.formattedRatio(insertionCountByLength[1] + insertionCountByLength[2], insertionCountByLength[3]);
+
        SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms);
        indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms);

        insertion_to_deletion_ratio = Utils.formattedRatio(nInsertions, n_indels - nInsertions);
-        insertion_to_deletion_ratio_for_1bp_indels = Utils.formattedRatio(n1bpInsertions, n1bpDeletions);
        insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions);

    }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java
@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2012, The Broad Institute 
+ *
+ * Permission is hereby granted, free of charge, to any person 
+ * obtaining a copy of this software and associated documentation 
+ * files (the "Software"), to deal in the Software without 
+ * restriction, including without limitation the rights to use, 
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell 
+ * copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following 
+ * conditions: 
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
+
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Stratifies the eval RODs into sites where the indel is 1 bp in length and those where the event is 2+.
+ * all non indel events go into all bins, so that SNP counts can be used as contrasts in eval modules.
+ */
+public class OneBPIndel extends VariantStratifier {
+    private final static List<Object> ALL = Arrays.asList((Object)"all", (Object)"one.bp", (Object)"two.plus.bp");
+    private final static List<Object> ONE_BP = Arrays.asList((Object)"all", (Object)"one.bp");
+    private final static List<Object> TWO_PLUS_BP = Arrays.asList((Object)"all", (Object)"two.plus.bp");
+
+    @Override
+    public void initialize() {
+        states.addAll(ALL);
+    }
+
+    @Override
+    public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
+        if (eval != null && eval.isIndel()) {
+            for ( int l : eval.getIndelLengths() )
+                if ( l > 1 )
+                    return TWO_PLUS_BP; // someone is too long
+            return ONE_BP; // all lengths are one
+        } else
+            return ALL;
+    }
+}
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java
@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2012, The Broad Institute 
+ *
+ * Permission is hereby granted, free of charge, to any person 
+ * obtaining a copy of this software and associated documentation 
+ * files (the "Software"), to deal in the Software without 
+ * restriction, including without limitation the rights to use, 
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell 
+ * copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following 
+ * conditions: 
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
+
+import org.broad.tribble.util.ParsingUtils;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Stratifies the eval RODs into sites that are tandem repeats
+ */
+public class TandemRepeat extends VariantStratifier {
+    private final static List<Object> JUST_ALL = Arrays.asList((Object)"all");
+    private final static List<Object> ALL = Arrays.asList((Object)"all", (Object)"is.repeat", (Object)"not.repeat");
+    private final static List<Object> REPEAT = Arrays.asList((Object)"all", (Object)"is.repeat");
+    private final static List<Object> NOT_REPEAT = Arrays.asList((Object)"all", (Object)"not.repeat");
+
+    @Override
+    public void initialize() {
+        states.addAll(ALL);
+    }
+
+    @Override
+    public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
+        if ( eval == null || ! eval.isIndel() )
+            return ALL;
+        else if ( VariantContextUtils.isTandemRepeat(eval, ref.getForwardBases()) ) {
+            print("REPEAT", eval, ref);
+            return REPEAT;
+        } else {
+            print("NOT A REPEAT", eval, ref);
+            return NOT_REPEAT;
+        }
+    }
+    
+    private final void print(String prefix, VariantContext eval, ReferenceContext ref) {
+//        String alleles = ParsingUtils.sortList(eval.getAlleles()).toString();
+//        this.getVariantEvalWalker().getLogger().info(prefix + ": " + "pos=" + eval.getStart() + " alleles=" + alleles + " ref=" + new String(ref.getForwardBases()));
+    }
+}
--- a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java
@ -224,10 +224,6 @@ public class IndelUtils {
        return inds;
    }

-    public static String[] getIndelClassificationNames() {
-        return COLUMN_KEYS;
-    }
-
    public static String getIndelClassificationName(int k) {
        if (k >=0 && k < COLUMN_KEYS.length)
            return COLUMN_KEYS[k];
@ -235,35 +231,6 @@ public class IndelUtils {
            throw new ReviewedStingException("Invalid index when trying to get indel classification name");
    }

-    public static boolean isATExpansion(VariantContext vc, ReferenceContext ref) {
-        ArrayList<Integer> inds = findEventClassificationIndex(vc, ref);
-
-        boolean isIt = false;
-        for (int k : inds) {
-            if (k == IND_FOR_REPEAT_EXPANSION_A || k == IND_FOR_REPEAT_EXPANSION_T) {
-                isIt = true;
-                break;
-            }
-        }
-
-        return isIt;
-
-    }
-    public static boolean isCGExpansion(VariantContext vc, ReferenceContext ref) {
-        ArrayList<Integer> inds = findEventClassificationIndex(vc, ref);
-
-         boolean isIt = false;
-         for (int k : inds) {
-             if (k == IND_FOR_REPEAT_EXPANSION_C || k == IND_FOR_REPEAT_EXPANSION_G) {
-                 isIt = true;
-                 break;
-             }
-         }
-
-         return isIt;
-
-    }
-
    public static boolean isInsideExtendedIndel(VariantContext vc, ReferenceContext ref) {
        return (vc.getStart() != ref.getLocus().getStart());
    }
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@ -302,7 +302,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
                " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" +
                " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf";
        WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s",
-                1, Arrays.asList("c8a782f51e094dc7be06dbfb795feab2"));
+                1, Arrays.asList("4c00cfa0fd343fef62d19af0edeb4f65"));
        executeTestParallel("testSelect1", spec);
    }

@ -330,7 +330,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
    @Test
    public void testCompVsEvalAC() {
        String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf";
-        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("5c409a2ab4517f862c6678902c0fd7a1"));
+        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("4df6654860ad63b7e24e6bc5fbbbcb00"));
        executeTestParallel("testCompVsEvalAC",spec);
    }

@ -360,7 +360,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
                           " --dbsnp " + b37dbSNP132 +
                           " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
                           " -noST -ST Novelty -o %s";
-        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("a27c700eabe6b7b3877c8fd4eabb3975"));
+        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3b85cd0fa37539ff51d34e026f26fef2"));
        executeTestParallel("testEvalTrackWithoutGenotypes",spec);
    }

@ -372,7 +372,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
                " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
                " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" +
                " -noST -ST Novelty -o %s";
-        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3272a2db627d4f42bc512df49a8ea64b"));
+        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("bed8751c773b9568218f78c90f13348a"));
        executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec);
    }

@ -488,11 +488,32 @@ public class VariantEvalIntegrationTest extends WalkerTest {
                                        "-o %s"
                                ),
                                1,
-                                Arrays.asList("41a37636868a838a632559949c5216cf")
+                                Arrays.asList("9726c0c8f19d271cf680f5f16f0926b3")
                              );
        executeTest("testModernVCFWithLargeIndels", spec);
    }

+    @Test
+    public void testStandardIndelEval() {
+        WalkerTestSpec spec = new WalkerTestSpec(
+                buildCommandLine(
+                        "-T VariantEval",
+                        "-R " + b37KGReference,
+                        "-eval " + validationDataLocation + "/NA12878.HiSeq.WGS.b37_decoy.indel.recalibrated.vcf",
+                        "-L 20",
+                        "-noST -ST Sample -ST OneBPIndel -ST TandemRepeat",
+                        "-noEV -EV IndelSummary -EV IndelLengthHistogram",
+                        "-gold " + validationDataLocation + "/Mills_and_1000G_gold_standard.indels.b37.sites.vcf",
+                        "-D " + b37dbSNP132,
+                        "-o %s"
+                ),
+                1,
+                Arrays.asList("c89705147ef4233d5de3a539469bd1d1")
+        );
+        executeTest("testStandardIndelEval", spec);
+    }
+
+
    @Test()
    public void testIncompatibleEvalAndStrat() {
        WalkerTestSpec spec = new WalkerTestSpec(