From 6df96644d94ccf270ba2dc7838abc61ba4498acf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 22 Mar 2012 21:18:38 -0400 Subject: [PATCH] Unified, standard IndelSummary metrics for VariantEval -- Now you always get SNP and indel metrics with VariantEval! -- Includes Number of SNPs, Number of singleton SNPs, Number of Indels, Number of singleton Indels, Percent of indel sites that are multi-allelic, SNP to indel ratio, Singleton SNP to indel ratio, Indel novelty rate, 1 to 2 bp indel ratio, 1 to 3 bp indel ratio, 2 to 3 bp indel ratio, 1 and 2 to 3 bp indel ratio, Frameshift percent, Insertion to deletion ratio, Insertion to deletion ratio for 1 bp events, Number of indels in protein-coding regions labeled as frameshift, Number of indels in protein-coding regions not labeled as frameshift, Het to hom ratio for SNPs, Het to hom ratio for indels, a Histogram of indel lengths, Number of large (>10 bp) deletions, Number of large (>10 bp) insertions, Ratio of large (>10 bp) insertions to deletions -- Updated VE integration tests as appropriate --- .../varianteval/evaluators/IndelSummary.java | 230 ++++++++++++++++++ .../varianteval/util/IndelHistogram.java | 113 +++++++++ .../VariantEvalIntegrationTest.java | 34 +-- 3 files changed, 360 insertions(+), 17 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/IndelHistogram.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java new file mode 100644 index 000000000..51cf2bb6a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.IndelHistogram; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +@Analysis(description = "Evaluation summary for indels") +public class IndelSummary extends VariantEvaluator implements StandardEval { + final protected static Logger logger = Logger.getLogger(IndelSummary.class); + + @DataPoint(description = "Number of SNPs", format = "%d") + public int n_SNPs = 0; + + @DataPoint(description = "Number of singleton SNPs", format = "%d") + public int n_singleton_SNPs = 0; + + @DataPoint(description = "Number of Indels", format = "%d") + public int n_indels = 0; + + // Number of Indels Sites (counts one for any number of alleles at site) + public int nIndelSites = 0; + + @DataPoint(description = "Number of singleton Indels", format = "%d") + public int n_singleton_indels = 0; + + // counts 1 for each site where the number of alleles > 2 + public int nMultiIndelSites = 0; + + @DataPoint(description = "Percent of indel sites that are multi-allelic") + public String percent_of_sites_with_more_than_2_alleles; + + @DataPoint(description = "SNP to indel ratio") + public String SNP_to_indel_ratio; + + @DataPoint(description = "Singleton SNP to indel ratio") + public String SNP_to_indel_ratio_for_singletons; + + @DataPoint(description = "Indel novelty rate") + public String indel_novelty_rate; + + @DataPoint(description = "1 to 2 bp indel ratio") + public String ratio_of_1_to_2_bp_indels; + + @DataPoint(description = "1 to 3 bp indel ratio") + public String ratio_of_1_to_3_bp_indels; + + @DataPoint(description = "2 to 3 bp indel ratio") + public String ratio_of_2_to_3_bp_indels; + + @DataPoint(description = "1 and 2 to 3 bp indel ratio") + public String ratio_of_1_and_2_to_3_bp_indels; + + @DataPoint(description = "Frameshift percent") + public String frameshift_rate_for_coding_indels; + + // + // insertions to deletions + // + @DataPoint(description = "Insertion to deletion ratio") + public String insertion_to_deletion_ratio; + + @DataPoint(description = "Insertion to deletion ratio for 1 bp events") + public String insertion_to_deletion_ratio_for_1bp_indels; + + // + // Frameshifts + // + @DataPoint(description = "Number of indels in protein-coding regions labeled as frameshift") + public int n_coding_indels_frameshifting = 0; + + @DataPoint(description = "Number of indels in protein-coding regions not labeled as frameshift") + public int n_coding_indels_in_frame = 0; + + // + // Het : hom ratios + // + @DataPoint(description = "Het to hom ratio for SNPs") + public String SNP_het_to_hom_ratio; + + @DataPoint(description = "Het to hom ratio for indels") + public String indel_het_to_hom_ratio; + + int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0; + + int nKnownIndels = 0, nInsertions = 0; + int n1bpInsertions = 0, n1bpDeletions = 0; + int[] countByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used + + public final static int MAX_SIZE_FOR_HISTOGRAM = 10; + @DataPoint(description = "Histogram of indel lengths") + IndelHistogram lengthHistogram = new IndelHistogram(MAX_SIZE_FOR_HISTOGRAM, true); + + @DataPoint(description = "Number of large (>10 bp) deletions") + public int n_large_deletions = 0; + + @DataPoint(description = "Number of large (>10 bp) insertions") + public int n_large_insertions = 0; + + @DataPoint(description = "Ratio of large (>10 bp) insertions to deletions") + public String insertion_to_deletion_ratio_for_large_indels; + + @Override public boolean enabled() { return true; } + @Override public int getComparisonOrder() { return 2; } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || eval.isMonomorphicInSamples() ) + return null; + + // update counts + switch ( eval.getType() ) { + case SNP: + n_SNPs += eval.getNAlleles() - 1; // -1 for ref + if ( variantWasSingleton(eval) ) n_singleton_SNPs++; + + // collect information about het / hom ratio + for ( final Genotype g : eval.getGenotypes() ) { + if ( g.isHet() ) nSNPHets++; + if ( g.isHomVar() ) nSNPHoms++; + } + break; + case INDEL: + if ( eval.isComplexIndel() ) break; // don't count complex substitutions + + nIndelSites++; + if ( ! eval.isBiallelic() ) nMultiIndelSites++; + if ( variantWasSingleton(eval) ) n_singleton_indels++; + + // collect information about het / hom ratio + for ( final Genotype g : eval.getGenotypes() ) { + if ( g.isHet() ) nIndelHets++; + if ( g.isHomVar() ) nIndelHoms++; + } + + for ( Allele alt : eval.getAlternateAlleles() ) { + n_indels++; // +1 for each alt allele + + if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific? + + // ins : del ratios + final int alleleSize = alt.length() - eval.getReference().length(); + if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference()); + if ( alleleSize > 0 ) nInsertions++; + if ( alleleSize == 1 ) n1bpInsertions++; + if ( alleleSize == -1 ) n1bpDeletions++; + + // update the length histogram + lengthHistogram.update(eval.getReference(), alt); + + // requires snpEFF annotations + if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) { + final String effect = eval.getAttributeAsString("SNPEFF_EFFECT", "missing"); + if ( effect.equals("missing") ) + throw new ReviewedStingException("Saw SNPEFF_GENE_BIOTYPE but unexpected no SNPEFF_EFFECT at " + eval); + if ( effect.equals("FRAME_SHIFT") ) + n_coding_indels_frameshifting++; + else if ( effect.startsWith("CODON") ) + n_coding_indels_in_frame++; + else + ; // lots of protein coding effects that shouldn't be counted, such as INTRON + } + + // update the baby histogram + final int absSize = Math.abs(alleleSize); + if ( absSize < countByLength.length ) countByLength[absSize]++; + } + + break; + default: + throw new UserException.BadInput("Unexpected variant context type: " + eval); + } + + return null; // we don't capture any interesting sites + } + + public void finalizeEvaluation() { + percent_of_sites_with_more_than_2_alleles = formattedRatio(nMultiIndelSites, nIndelSites); + SNP_to_indel_ratio = formattedRatio(n_SNPs, n_indels); + SNP_to_indel_ratio_for_singletons = formattedRatio(n_singleton_SNPs, n_singleton_indels); + indel_novelty_rate = formattedNoveltyRate(nKnownIndels, n_indels); + ratio_of_1_to_2_bp_indels = formattedRatio(countByLength[1], countByLength[2]); + ratio_of_1_to_3_bp_indels = formattedRatio(countByLength[1], countByLength[3]); + ratio_of_2_to_3_bp_indels = formattedRatio(countByLength[2], countByLength[3]); + ratio_of_1_and_2_to_3_bp_indels = formattedRatio(countByLength[1] + countByLength[2], countByLength[3]); + frameshift_rate_for_coding_indels = formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting); + + SNP_het_to_hom_ratio = formattedRatio(nSNPHets, nSNPHoms); + indel_het_to_hom_ratio = formattedRatio(nIndelHets, nIndelHoms); + + n_large_deletions = lengthHistogram.getnTooBigDeletions(); + n_large_insertions = lengthHistogram.getnTooBigInsertions(); + + insertion_to_deletion_ratio = formattedRatio(nInsertions, n_indels - nInsertions); + insertion_to_deletion_ratio_for_1bp_indels = formattedRatio(n1bpInsertions, n1bpDeletions); + insertion_to_deletion_ratio_for_large_indels = formattedRatio(n_large_insertions, n_large_deletions); + + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/IndelHistogram.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/IndelHistogram.java new file mode 100644 index 000000000..a6c86d3da --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/IndelHistogram.java @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.util; + +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.*; + +/** + * Simple utility for histogramming indel lengths + * + * Based on code from chartl + * + * @author Mark DePristo + * @since 3/21/12 + */ +public class IndelHistogram extends TableType { + private final boolean asFrequencies; + int nIndels = 0, nTooBigDeletions = 0, nTooBigInsertions = 0; + private final Integer[] rowKeys; + + private Map frequencies = null; + private final Map counts = new HashMap(); + + public IndelHistogram(int maxSize, boolean asFrequencies) { + this.asFrequencies = asFrequencies; + initializeCounts(maxSize); + this.rowKeys = new ArrayList(counts.keySet()).toArray(new Integer[maxSize]); + } + + private void initializeCounts(int size) { + for ( int i = -size; i <= size; i++ ) { + if ( i != 0 ) counts.put(i, 0); + } + } + + @Override + public String getRowName() { + return "Length"; + } + + @Override + public Object[] getColumnKeys() { + return new String[]{"Count"}; + } + + @Override + public Object[] getRowKeys() { + return rowKeys; + } + + @Override + public Object getCell(int row, int col) { + final int key = (Integer)getRowKeys()[row]; + if ( asFrequencies ) { + if ( frequencies == null ) { + frequencies = new HashMap(); + for ( final int len : counts.keySet() ) { + final double value = nIndels == 0 ? 0.0 : counts.get(len) / (1.0 * nIndels); + frequencies.put(len, value); + } + } + return frequencies.get(key); + } + return counts.get(key); + } + + public int getnTooBigDeletions() { + return nTooBigDeletions; + } + + public int getnTooBigInsertions() { + return nTooBigInsertions; + } + + public void update(final Allele ref, final Allele alt) { + final int alleleSize = alt.length() - ref.length(); + update(alleleSize); + } + + public void update(int len) { + if ( counts.containsKey(len) ) { + nIndels++; + counts.put(len, counts.get(len) + 1); + } else if ( len < 0 ) { + nTooBigDeletions++; + } else { + nTooBigInsertions++; + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 9f69554fe..610733d9c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -94,7 +94,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("94fb8cba9e236131c6fbf1d7fee738fe") + Arrays.asList("7a726ecbedd722fa7cd4de3e023b7a82") ); executeTest("testFundamentalsCountVariantsSNPsandIndels", spec); } @@ -115,7 +115,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("670979268b05c3024297ba98d67d89ab") + Arrays.asList("95bb4a4267a8f29dd7a8169561499f20") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -137,7 +137,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("c38ce9c872a76ae7dd26c3e353bf0765") + Arrays.asList("9b51029083495935823fb0447a2857b9") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } @@ -158,7 +158,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("2c37f23bf6114a2b27f21ed445806fd2") + Arrays.asList("318b5fbbc61e2fc11d49369359812edd") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithCpG", spec); } @@ -179,7 +179,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("206f0d629de9af0b97340cb22d34a81b") + Arrays.asList("74c02df2ef69dda231a2aec2a948747b") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } @@ -200,7 +200,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("bd869725429deae8f56175ba9a8ab390") + Arrays.asList("2d97b1fe15e532e89803ba7ba347ff20") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithDegeneracy", spec); } @@ -221,7 +221,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("9c7f6783a57ad681bb754b5e71de27dc") + Arrays.asList("474cbc231ddbc4ba299ffe61a17405b6") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithSample", spec); } @@ -244,7 +244,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("a2d280440aa3771937f3d2d10f1eea74") + Arrays.asList("2cc9bc4bbe8b4edb6dc27642ec41f66e") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithJexlExpression", spec); } @@ -269,7 +269,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("2925d811dd521beb00059f8c8e818d83") + Arrays.asList("00c94cf3e14bc2855d39bbefa27f9bb2") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithMultipleJexlExpressions", spec); } @@ -288,7 +288,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("4b79bf2dfd73ddac0ceb0838a352bf9a") + Arrays.asList("a0c0d4805db1245aa30a306aa506096f") ); executeTest("testFundamentalsCountVariantsNoCompRod", spec); } @@ -301,7 +301,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("1739654de350541edf429888b708ae01")); + 1, Arrays.asList("2192418a70a8e018a1675d4f425155f3")); executeTestParallel("testSelect1", spec); } @@ -329,7 +329,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("d57cf846bc26d338edcf181fb0c85535")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("2282523336c24d434d1cc0eb1697b4f9")); executeTestParallel("testCompVsEvalAC",spec); } @@ -359,7 +359,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("b663745a39f62bfa5b5d486811cf57ec")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("ec321fcc424fbad74a4a74e739173d03")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -371,7 +371,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("f1e1b1469dca86d72ae79a2d3e10612c")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("ccaea6245086552cd63f828eabddfaf3")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -449,7 +449,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("0c632b5be8a54e43afa576510b40c4da") + Arrays.asList("9954c769ef37c47d3b61481ab0807be0") ); executeTest("testAlleleCountStrat", spec); } @@ -470,7 +470,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("92404820a94e7cfb854ae73450a0fbd9") + Arrays.asList("c0d69ce7647a575d166d8bab5aa16299") ); executeTest("testIntervalStrat", spec); } @@ -487,7 +487,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("417875ab1924b7e7950fa10daee393d2") + Arrays.asList("9a8ffb506118c1bde6f7bfadc4fb6f10") ); executeTest("testModernVCFWithLargeIndels", spec); }