IndelSummary now emits all of the underlying counts for the ratios, percentages, etc. that it computes

This commit is contained in:
Mark DePristo 2012-04-13 11:21:02 -04:00
parent 542a8e3306
commit 23ccf772d4
1 changed file with 46 additions and 24 deletions

View File

@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -41,51 +40,81 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
public class IndelSummary extends VariantEvaluator implements StandardEval {
final protected static Logger logger = Logger.getLogger(IndelSummary.class);
//
// counts of snps and indels
//
@DataPoint(description = "Number of SNPs", format = "%d")
public int n_SNPs = 0;
@DataPoint(description = "Number of singleton SNPs", format = "%d")
public int n_singleton_SNPs = 0;
@DataPoint(description = "Number of Indels", format = "%d")
@DataPoint(description = "Number of indels", format = "%d")
public int n_indels = 0;
// Number of Indels Sites (counts one for any number of alleles at site)
public int nIndelSites = 0;
@DataPoint(description = "Number of singleton Indels", format = "%d")
@DataPoint(description = "Number of singleton indels", format = "%d")
public int n_singleton_indels = 0;
//
// gold standard
//
@DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d")
public int n_indels_matching_gold_standard = 0;
@DataPoint(description = "Percent of indels overlapping gold standard sites")
public String gold_standard_matching_rate;
//
// multi-allelics
//
// Number of indel sites (each site is counted once, regardless of how many alleles it has)
public int nIndelSites = 0;
@DataPoint(description = "Number of sites with where the number of alleles is greater than 2")
public int n_multiallelic_indel_sites = 0;
@DataPoint(description = "Percent of indel sites that are multi-allelic")
public String percent_of_sites_with_more_than_2_alleles;
//
// snp : indel ratios
//
@DataPoint(description = "SNP to indel ratio")
public String SNP_to_indel_ratio;
@DataPoint(description = "Singleton SNP to indel ratio")
public String SNP_to_indel_ratio_for_singletons;
//
// novelty
//
@DataPoint(description = "Number of novel indels", format = "%d")
public int n_novel_indels = 0;
@DataPoint(description = "Indel novelty rate")
public String indel_novelty_rate;
@DataPoint(description = "Frameshift percent")
public String frameshift_rate_for_coding_indels;
//
// insertions to deletions
//
@DataPoint(description = "Number of insertion indels")
public int n_insertions = 0;
@DataPoint(description = "Number of deletion indels")
public int n_deletions = 0;
@DataPoint(description = "Insertion to deletion ratio")
public String insertion_to_deletion_ratio;
@DataPoint(description = "Number of large (>10 bp) deletions")
public int n_large_deletions = 0;
@DataPoint(description = "Number of large (>10 bp) insertions")
public int n_large_insertions = 0;
@DataPoint(description = "Ratio of large (>10 bp) insertions to deletions")
public String insertion_to_deletion_ratio_for_large_indels;
//
// Frameshifts
//
@ -95,6 +124,9 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Number of indels in protein-coding regions not labeled as frameshift")
public int n_coding_indels_in_frame = 0;
@DataPoint(description = "Frameshift percent")
public String frameshift_rate_for_coding_indels;
//
// Het : hom ratios
//
@ -106,8 +138,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0;
int nKnownIndels = 0, nInsertions = 0;
int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
@ -129,15 +159,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
public final static int LARGE_INDEL_SIZE_THRESHOLD = 10;
@DataPoint(description = "Number of large (>10 bp) deletions")
public int n_large_deletions = 0;
@DataPoint(description = "Number of large (>10 bp) insertions")
public int n_large_insertions = 0;
@DataPoint(description = "Ratio of large (>10 bp) insertions to deletions")
public String insertion_to_deletion_ratio_for_large_indels;
@Override public int getComparisonOrder() { return 2; }
public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -171,13 +192,14 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
for ( Allele alt : eval.getAlternateAlleles() ) {
n_indels++; // +1 for each alt allele
if ( variantWasSingleton(eval) ) n_singleton_indels++;
if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific?
if ( comp == null ) n_novel_indels++; // TODO -- make this test allele specific?
if ( gold != null ) n_indels_matching_gold_standard++;
// ins : del ratios
final int alleleSize = alt.length() - eval.getReference().length();
if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference());
if ( alleleSize > 0 ) nInsertions++;
if ( alleleSize > 0 ) n_insertions++;
if ( alleleSize < 0 ) n_deletions++;
// requires snpEFF annotations
if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) {
@ -220,7 +242,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels);
gold_standard_matching_rate = Utils.formattedPercent(n_indels_matching_gold_standard, n_indels);
indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels);
indel_novelty_rate = Utils.formattedNoveltyRate(n_indels - n_novel_indels, n_indels);
frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting);
ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]);
@ -229,7 +251,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms);
indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms);
insertion_to_deletion_ratio = Utils.formattedRatio(nInsertions, n_indels - nInsertions);
insertion_to_deletion_ratio = Utils.formattedRatio(n_insertions, n_deletions);
insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions);
}