From 23ccf772d4f644214eebd9d4e5fcc9358e5d55bf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 13 Apr 2012 11:21:02 -0400 Subject: [PATCH] IndelSummary now emits all of the underlying counts for ratios, percentages, etc it computes --- .../varianteval/evaluators/IndelSummary.java | 70 ++++++++++++------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java index 198172411..dda7e8611 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -41,51 +40,81 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; public class IndelSummary extends VariantEvaluator implements StandardEval { final protected static Logger logger = Logger.getLogger(IndelSummary.class); + // + // counts of snps and indels + // @DataPoint(description = "Number of SNPs", format = "%d") public int n_SNPs = 0; @DataPoint(description = "Number of singleton SNPs", format = "%d") public int n_singleton_SNPs = 0; - @DataPoint(description = "Number of Indels", format = "%d") + @DataPoint(description = "Number of indels", format = "%d") public int n_indels = 0; - // Number of Indels Sites (counts one for any number of alleles at site) - public int nIndelSites = 0; - - @DataPoint(description = "Number of singleton Indels", format = "%d") + @DataPoint(description = "Number of singleton indels", format = "%d") public int n_singleton_indels = 0; + // + // gold standard + // @DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d") public int n_indels_matching_gold_standard = 0; @DataPoint(description = "Percent of indels overlapping gold standard sites") public String gold_standard_matching_rate; + // + // multi-allelics + // + // Number of Indels Sites (counts one for any number of alleles at site) + public int nIndelSites = 0; + @DataPoint(description = "Number of sites with where the number of alleles is greater than 2") public int n_multiallelic_indel_sites = 0; @DataPoint(description = "Percent of indel sites that are multi-allelic") public String percent_of_sites_with_more_than_2_alleles; + // + // snp : indel ratios + // @DataPoint(description = "SNP to indel ratio") public String SNP_to_indel_ratio; @DataPoint(description = "Singleton SNP to indel ratio") public String SNP_to_indel_ratio_for_singletons; + // + // novelty + // + @DataPoint(description = "Number of novel indels", format = "%d") + public int n_novel_indels = 0; + @DataPoint(description = "Indel novelty rate") public String indel_novelty_rate; - @DataPoint(description = "Frameshift percent") - public String frameshift_rate_for_coding_indels; - // // insertions to deletions // + @DataPoint(description = "Number of insertion indels") + public int n_insertions = 0; + + @DataPoint(description = "Number of deletion indels") + public int n_deletions = 0; + @DataPoint(description = "Insertion to deletion ratio") public String insertion_to_deletion_ratio; + @DataPoint(description = "Number of large (>10 bp) deletions") + public int n_large_deletions = 0; + + @DataPoint(description = "Number of large (>10 bp) insertions") + public int n_large_insertions = 0; + + @DataPoint(description = "Ratio of large (>10 bp) insertions to deletions") + public String insertion_to_deletion_ratio_for_large_indels; + // // Frameshifts // @@ -95,6 +124,9 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { @DataPoint(description = "Number of indels in protein-coding regions not labeled as frameshift") public int n_coding_indels_in_frame = 0; + @DataPoint(description = "Frameshift percent") + public String frameshift_rate_for_coding_indels; + // // Het : hom ratios // @@ -106,8 +138,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0; - int nKnownIndels = 0, nInsertions = 0; - int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used @@ -129,15 +159,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { public final static int LARGE_INDEL_SIZE_THRESHOLD = 10; - @DataPoint(description = "Number of large (>10 bp) deletions") - public int n_large_deletions = 0; - - @DataPoint(description = "Number of large (>10 bp) insertions") - public int n_large_insertions = 0; - - @DataPoint(description = "Ratio of large (>10 bp) insertions to deletions") - public String insertion_to_deletion_ratio_for_large_indels; - @Override public int getComparisonOrder() { return 2; } public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -171,13 +192,14 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { for ( Allele alt : eval.getAlternateAlleles() ) { n_indels++; // +1 for each alt allele if ( variantWasSingleton(eval) ) n_singleton_indels++; - if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific? + if ( comp == null ) n_novel_indels++; // TODO -- make this test allele specific? if ( gold != null ) n_indels_matching_gold_standard++; // ins : del ratios final int alleleSize = alt.length() - eval.getReference().length(); if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference()); - if ( alleleSize > 0 ) nInsertions++; + if ( alleleSize > 0 ) n_insertions++; + if ( alleleSize < 0 ) n_deletions++; // requires snpEFF annotations if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) { @@ -220,7 +242,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels); gold_standard_matching_rate = Utils.formattedPercent(n_indels_matching_gold_standard, n_indels); - indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels); + indel_novelty_rate = Utils.formattedNoveltyRate(n_indels - n_novel_indels, n_indels); frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting); ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]); @@ -229,7 +251,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms); indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms); - insertion_to_deletion_ratio = Utils.formattedRatio(nInsertions, n_indels - nInsertions); + insertion_to_deletion_ratio = Utils.formattedRatio(n_insertions, n_deletions); insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions); }