From 2ac938fe4ee8e2078bf9f27463b721988dccf646 Mon Sep 17 00:00:00 2001 From: delangel Date: Tue, 30 Nov 2010 21:08:25 +0000 Subject: [PATCH] 1) Minor fixes to avoid crashes vs CG indel files: - Add count for complex events, not just insertions and deletions - Handle correctly cases of large indels falling out of bounds of histogram array: added a count of indels ouf of bounds and avoid exceptions. 2) Cosmetic fix for R script assessing UG calling performance: draw red y=x line on top of Simulated vs Estimated AC to get a better view of under/over-estimation of AC. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4758 348d0f76-0448-11de-a6fe-93d51630548a --- R/assessCallingPerformance.R | 5 ++-- .../walkers/varianteval/IndelStatistics.java | 29 +++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/R/assessCallingPerformance.R b/R/assessCallingPerformance.R index bddfcb8ab..f364bc427 100644 --- a/R/assessCallingPerformance.R +++ b/R/assessCallingPerformance.R @@ -49,9 +49,10 @@ for ( i in 1:(dim(results)[1]) ) { results[i,]$specificity = x$specificity } -for ( depth in DEPTHS ) +for ( depth in DEPTHS ) { boxplot(called.AC ~ sim.AC, data = subset(d, called.DP == depth * NS), main = paste("Depth of coverage ", depth), xlab = "Simulation AC", ylab = "Called AC", outwex=0.5, col = "cornflowerblue") - + abline(a=0,b=1,col="red",lwd=3) +} print(results) par(mfcol=c(2,1)) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java index 668b43881..34a4fb317 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java @@ -45,7 +45,7 @@ public class IndelStatistics extends VariantEvaluator { private static final int INDEL_SIZE_LIMIT = 100; - private static final int NUM_SCALAR_COLUMNS = 8; + private static final int NUM_SCALAR_COLUMNS = 10; static int len2Index(int ind) { return ind+INDEL_SIZE_LIMIT+NUM_SCALAR_COLUMNS; @@ -67,6 +67,9 @@ public class IndelStatistics extends VariantEvaluator { COLUMN_KEYS[5] = "number_het_deletions"; COLUMN_KEYS[6] = "number_homozygous_deletions"; COLUMN_KEYS[7] = "number of homozygous reference sites"; + COLUMN_KEYS[8] = "number of complex events"; + COLUMN_KEYS[9] = "number of long indels"; + for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++) COLUMN_KEYS[k] = "indel_size_len"+Integer.valueOf(index2len(k)); } @@ -132,7 +135,7 @@ public class IndelStatistics extends VariantEvaluator { */ public void incrValue(VariantContext vc) { int eventLength = 0; - boolean isInsertion = false; + boolean isInsertion = false, isDeletion = false; if ( vc.isInsertion() ) { eventLength = vc.getAlternateAllele(0).length(); @@ -141,8 +144,18 @@ public class IndelStatistics extends VariantEvaluator { } else if ( vc.isDeletion() ) { indelSummary.get(ALL_SAMPLES_KEY)[2]++; eventLength = -vc.getReference().length(); + isDeletion = true; } - indelSummary.get(ALL_SAMPLES_KEY)[len2Index(eventLength)]++; + else { + indelSummary.get(ALL_SAMPLES_KEY)[8]++; + } + + // make sure event doesn't overstep array boundaries + if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) + indelSummary.get(ALL_SAMPLES_KEY)[len2Index(eventLength)]++; + else + indelSummary.get(ALL_SAMPLES_KEY)[9]++; + for( final String sample : vc.getGenotypes().keySet() ) { if ( indelSummary.containsKey(sample) ) { @@ -153,10 +166,16 @@ public class IndelStatistics extends VariantEvaluator { if (isInsertion) { indelSummary.get(sample)[1]++; } - else + else if (isDeletion) indelSummary.get(sample)[2]++; + else + indelSummary.get(sample)[8]++; + // update histogram - indelSummary.get(sample)[len2Index(eventLength)]++; + if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) + indelSummary.get(sample)[len2Index(eventLength)]++; + else + indelSummary.get(sample)[9]++; if (g.isHet()) if (isInsertion)