From 45fc0ea98d02f887c6cee0677d5f0f449fac8a7b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 6 Apr 2012 16:02:09 -0400 Subject: [PATCH] Improvements to indel analysis capabilities of VariantEval -- Now calculates the number of Indels overlapping gold standard sites, as well as the percent of indels overlapping gold standard sites -- Removed insertion : deletion ratio for 1 bp event, replaced it with 1 + 2 : 3 bp ratio for insertions and deletions separately. This is based on an old email from Mark Daly: // - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a // downstream frameshift, if we make the simplifying assumptions that 3 bp ins // and 3bp del (adding/subtracting 1 AA in general) are roughly comparably // selected against, we should see a consistent 1+2 : 3 bp ratio for insertions // as for deletions, and certainly would expect consistency between in/dels that // multiple methods find and in/dels that are unique to one method (since deletions // are more common and the artifacts differ, it is probably worth looking at the totals, // overlaps and ratios for insertions and deletions separately in the methods // comparison and in this case don't even need to make the simplifying in = del functional assumption -- Added a new VEW argument to bind a gold standard track -- Added two new stratifications: OneBPIndel and TandemRepeat which do exactly what you imagine they do -- Deleted random unused functions in IndelUtils --- .../varianteval/VariantEvalWalker.java | 9 +++ .../varianteval/evaluators/IndelSummary.java | 59 +++++++++------- .../stratifications/OneBPIndel.java | 59 ++++++++++++++++ .../stratifications/TandemRepeat.java | 67 +++++++++++++++++++ .../sting/utils/IndelUtils.java | 33 --------- .../VariantEvalIntegrationTest.java | 31 +++++++-- 6 files changed, 194 insertions(+), 64 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index b0877d893..6c7922ea5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -116,6 +116,15 @@ public class VariantEvalWalker extends RodWalker implements Tr @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + /** + * Some analyses want to count overlap not with dbSNP (which is in general very open) but + * actually want to itemize their overlap specifically with a set of gold standard sites + * such as HapMap, OMNI, or the gold standard indels. Theis argument provides a mechanism + * for communicating which file to use + */ + @Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false) + public RodBinding goldStandard = null; + // Help arguments @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false) protected Boolean LIST = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java index 49b865c31..786b7296b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -56,6 +56,12 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { @DataPoint(description = "Number of singleton Indels", format = "%d") public int n_singleton_indels = 0; + @DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d") + public int n_indels_matching_gold_standard = 0; + + @DataPoint(description = "Percent of indels overlapping gold standard sites") + public String gold_standard_matching_rate; + // counts 1 for each site where the number of alleles > 2 public int nMultiIndelSites = 0; @@ -71,18 +77,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { @DataPoint(description = "Indel novelty rate") public String indel_novelty_rate; - @DataPoint(description = "1 to 2 bp indel ratio") - public String ratio_of_1_to_2_bp_indels; - - @DataPoint(description = "1 to 3 bp indel ratio") - public String ratio_of_1_to_3_bp_indels; - - @DataPoint(description = "2 to 3 bp indel ratio") - public String ratio_of_2_to_3_bp_indels; - - @DataPoint(description = "1 and 2 to 3 bp indel ratio") - public String ratio_of_1_and_2_to_3_bp_indels; - @DataPoint(description = "Frameshift percent") public String frameshift_rate_for_coding_indels; @@ -92,9 +86,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { @DataPoint(description = "Insertion to deletion ratio") public String insertion_to_deletion_ratio; - @DataPoint(description = "Insertion to deletion ratio for 1 bp events") - public String insertion_to_deletion_ratio_for_1bp_indels; - // // Frameshifts // @@ -116,9 +107,25 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0; int nKnownIndels = 0, nInsertions = 0; - int n1bpInsertions = 0, n1bpDeletions = 0; - int[] countByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used + int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used + int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used + + // - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a + // downstream frameshift, if we make the simplifying assumptions that 3 bp ins + // and 3bp del (adding/subtracting 1 AA in general) are roughly comparably + // selected against, we should see a consistent 1+2 : 3 bp ratio for insertions + // as for deletions, and certainly would expect consistency between in/dels that + // multiple methods find and in/dels that are unique to one method (since deletions + // are more common and the artifacts differ, it is probably worth looking at the totals, + // overlaps and ratios for insertions and deletions separately in the methods + // comparison and in this case don't even need to make the simplifying in = del functional assumption + + @DataPoint(description = "ratio of 1 and 2 bp insertions to 3 bp insertions") + public String ratio_of_1_and_2_to_3_bp_insertions; + + @DataPoint(description = "ratio of 1 and 2 bp deletions to 3 bp deletions") + public String ratio_of_1_and_2_to_3_bp_deletions; public final static int LARGE_INDEL_SIZE_THRESHOLD = 10; @@ -150,11 +157,11 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { } break; case INDEL: + final VariantContext gold = getWalker().goldStandard == null ? null : tracker.getFirstValue(getWalker().goldStandard); if ( eval.isComplexIndel() ) break; // don't count complex substitutions nIndelSites++; if ( ! eval.isBiallelic() ) nMultiIndelSites++; - if ( variantWasSingleton(eval) ) n_singleton_indels++; // collect information about het / hom ratio for ( final Genotype g : eval.getGenotypes() ) { @@ -164,15 +171,14 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { for ( Allele alt : eval.getAlternateAlleles() ) { n_indels++; // +1 for each alt allele - + if ( variantWasSingleton(eval) ) n_singleton_indels++; if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific? + if ( gold != null ) n_indels_matching_gold_standard++; // ins : del ratios final int alleleSize = alt.length() - eval.getReference().length(); if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference()); if ( alleleSize > 0 ) nInsertions++; - if ( alleleSize == 1 ) n1bpInsertions++; - if ( alleleSize == -1 ) n1bpDeletions++; // requires snpEFF annotations if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) { @@ -193,6 +199,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { n_large_deletions++; // update the baby histogram + final int[] countByLength = alleleSize < 0 ? deletionCountByLength : insertionCountByLength; final int absSize = Math.abs(alleleSize); if ( absSize < countByLength.length ) countByLength[absSize]++; @@ -210,18 +217,18 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { percent_of_sites_with_more_than_2_alleles = Utils.formattedRatio(nMultiIndelSites, nIndelSites); SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels); SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels); + + gold_standard_matching_rate = Utils.formattedNoveltyRate(n_indels_matching_gold_standard, n_indels); indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels); - ratio_of_1_to_2_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[2]); - ratio_of_1_to_3_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[3]); - ratio_of_2_to_3_bp_indels = Utils.formattedRatio(countByLength[2], countByLength[3]); - ratio_of_1_and_2_to_3_bp_indels = Utils.formattedRatio(countByLength[1] + countByLength[2], countByLength[3]); frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting); + ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]); + ratio_of_1_and_2_to_3_bp_insertions = Utils.formattedRatio(insertionCountByLength[1] + insertionCountByLength[2], insertionCountByLength[3]); + SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms); indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms); insertion_to_deletion_ratio = Utils.formattedRatio(nInsertions, n_indels - nInsertions); - insertion_to_deletion_ratio_for_1bp_indels = Utils.formattedRatio(n1bpInsertions, n1bpDeletions); insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java new file mode 100644 index 000000000..fe4f7641f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Stratifies the eval RODs into sites where the indel is 1 bp in length and those where the event is 2+. + * all non indel events go into all bins, so that SNP counts can be used as contrasts in eval modules. + */ +public class OneBPIndel extends VariantStratifier { + private final static List ALL = Arrays.asList((Object)"all", (Object)"one.bp", (Object)"two.plus.bp"); + private final static List ONE_BP = Arrays.asList((Object)"all", (Object)"one.bp"); + private final static List TWO_PLUS_BP = Arrays.asList((Object)"all", (Object)"two.plus.bp"); + + @Override + public void initialize() { + states.addAll(ALL); + } + + @Override + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + if (eval != null && eval.isIndel()) { + for ( int l : eval.getIndelLengths() ) + if ( l > 1 ) + return TWO_PLUS_BP; // someone is too long + return ONE_BP; // all lengths are one + } else + return ALL; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java new file mode 100644 index 000000000..834c02b83 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.Arrays; +import java.util.List; + +/** + * Stratifies the eval RODs into sites that are tandem repeats + */ +public class TandemRepeat extends VariantStratifier { + private final static List JUST_ALL = Arrays.asList((Object)"all"); + private final static List ALL = Arrays.asList((Object)"all", (Object)"is.repeat", (Object)"not.repeat"); + private final static List REPEAT = Arrays.asList((Object)"all", (Object)"is.repeat"); + private final static List NOT_REPEAT = Arrays.asList((Object)"all", (Object)"not.repeat"); + + @Override + public void initialize() { + states.addAll(ALL); + } + + @Override + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + if ( eval == null || ! eval.isIndel() ) + return ALL; + else if ( VariantContextUtils.isTandemRepeat(eval, ref.getForwardBases()) ) { + print("REPEAT", eval, ref); + return REPEAT; + } else { + print("NOT A REPEAT", eval, ref); + return NOT_REPEAT; + } + } + + private final void print(String prefix, VariantContext eval, ReferenceContext ref) { +// String alleles = ParsingUtils.sortList(eval.getAlleles()).toString(); +// this.getVariantEvalWalker().getLogger().info(prefix + ": " + "pos=" + eval.getStart() + " alleles=" + alleles + " ref=" + new String(ref.getForwardBases())); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java index 74f147127..c6ca39f4b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java @@ -224,10 +224,6 @@ public class IndelUtils { return inds; } - public static String[] getIndelClassificationNames() { - return COLUMN_KEYS; - } - public static String getIndelClassificationName(int k) { if (k >=0 && k < COLUMN_KEYS.length) return COLUMN_KEYS[k]; @@ -235,35 +231,6 @@ public class IndelUtils { throw new ReviewedStingException("Invalid index when trying to get indel classification name"); } - public static boolean isATExpansion(VariantContext vc, ReferenceContext ref) { - ArrayList inds = findEventClassificationIndex(vc, ref); - - boolean isIt = false; - for (int k : inds) { - if (k == IND_FOR_REPEAT_EXPANSION_A || k == IND_FOR_REPEAT_EXPANSION_T) { - isIt = true; - break; - } - } - - return isIt; - - } - public static boolean isCGExpansion(VariantContext vc, ReferenceContext ref) { - ArrayList inds = findEventClassificationIndex(vc, ref); - - boolean isIt = false; - for (int k : inds) { - if (k == IND_FOR_REPEAT_EXPANSION_C || k == IND_FOR_REPEAT_EXPANSION_G) { - isIt = true; - break; - } - } - - return isIt; - - } - public static boolean isInsideExtendedIndel(VariantContext vc, ReferenceContext ref) { return (vc.getStart() != ref.getLocus().getStart()); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index d67fc61e2..035bf4020 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -302,7 +302,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("c8a782f51e094dc7be06dbfb795feab2")); + 1, Arrays.asList("4c00cfa0fd343fef62d19af0edeb4f65")); executeTestParallel("testSelect1", spec); } @@ -330,7 +330,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("5c409a2ab4517f862c6678902c0fd7a1")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("4df6654860ad63b7e24e6bc5fbbbcb00")); executeTestParallel("testCompVsEvalAC",spec); } @@ -360,7 +360,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("a27c700eabe6b7b3877c8fd4eabb3975")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3b85cd0fa37539ff51d34e026f26fef2")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -372,7 +372,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3272a2db627d4f42bc512df49a8ea64b")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("bed8751c773b9568218f78c90f13348a")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -488,11 +488,32 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("41a37636868a838a632559949c5216cf") + Arrays.asList("9726c0c8f19d271cf680f5f16f0926b3") ); executeTest("testModernVCFWithLargeIndels", spec); } + @Test + public void testStandardIndelEval() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "-eval " + validationDataLocation + "/NA12878.HiSeq.WGS.b37_decoy.indel.recalibrated.vcf", + "-L 20", + "-noST -ST Sample -ST OneBPIndel -ST TandemRepeat", + "-noEV -EV IndelSummary -EV IndelLengthHistogram", + "-gold " + validationDataLocation + "/Mills_and_1000G_gold_standard.indels.b37.sites.vcf", + "-D " + b37dbSNP132, + "-o %s" + ), + 1, + Arrays.asList("c89705147ef4233d5de3a539469bd1d1") + ); + executeTest("testStandardIndelEval", spec); + } + + @Test() public void testIncompatibleEvalAndStrat() { WalkerTestSpec spec = new WalkerTestSpec(