From 17db2e0e24d63d11637efc9aa1fc68d5dc09146e Mon Sep 17 00:00:00 2001 From: delangel Date: Mon, 13 Dec 2010 17:43:43 +0000 Subject: [PATCH] (forgot I hadn't committed this) - refactored IndelStatistics module and added a new inner class to compute Indel classification along with other statistics. So, we now get an extra table specifying, per sample, counts of whether indels are: - Repeat Expansions - Novel sequence And for indels of size <=2 we get a per-mononuc. or dinuc. breakdown of novels and expansions. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4828 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/varianteval/IndelStatistics.java | 268 ++++++++++++++++-- 1 file changed, 249 insertions(+), 19 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java index 34a4fb317..34f0732da 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.report.tags.Analysis; import org.broadinstitute.sting.utils.report.tags.DataPoint; import org.broadinstitute.sting.utils.report.utils.TableType; +import java.util.Arrays; import java.util.HashMap; /* @@ -37,11 +38,13 @@ import java.util.HashMap; * OTHER DEALINGS IN THE SOFTWARE. */ -@Analysis(name = "Indel Statistics", description = "Shows various indel metrics and statistics") +@Analysis(name = "IndelStatistics", description = "Shows various indel metrics and statistics") public class IndelStatistics extends VariantEvaluator { @DataPoint(name="IndelStatistics", description = "Indel Statistics") IndelStats indelStats = null; + @DataPoint(name="IndelClasses", description = "Indel Classification") + IndelClasses indelClasses = null; private static final int INDEL_SIZE_LIMIT = 100; @@ -54,6 +57,7 @@ public class IndelStatistics extends VariantEvaluator { static int index2len(int ind) { return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS; } + static class IndelStats implements TableType { protected final static String ALL_SAMPLES_KEY = "allSamples"; protected final static String[] COLUMN_KEYS; @@ -115,20 +119,6 @@ public class IndelStatistics extends VariantEvaluator { public String toString() { return getName(); } -/* - private double ratio(long numer, long denom) { - return denom != 0L ? 100.0 * ( ((double)numer) / ((double)denom) ) : 0.0; - } - - final long[] allSamplesNumerators = new long[COLUMN_KEYS.length]; - final long[] allSamplesDenominators = new long[COLUMN_KEYS.length]; - - private void updateSummaries(int i, double[] summary, long numer, long denom ) { - allSamplesNumerators[i] += numer; - allSamplesDenominators[i] += denom; - summary[i] = ratio(numer, denom); - } - */ /* * increment the specified value @@ -200,6 +190,241 @@ public class IndelStatistics extends VariantEvaluator { } } + static class IndelClasses implements TableType { + protected final static String ALL_SAMPLES_KEY = "allSamples"; + protected final static String[] COLUMN_KEYS; + + + + static { + COLUMN_KEYS= new String[41]; + COLUMN_KEYS[0] = "Novel_A"; + COLUMN_KEYS[1] = "Novel_C"; + COLUMN_KEYS[2] = "Novel_G"; + COLUMN_KEYS[3] = "Novel_T"; + COLUMN_KEYS[4] = "NOVEL_1"; + COLUMN_KEYS[5] = "NOVEL_2"; + COLUMN_KEYS[6] = "NOVEL_3"; + COLUMN_KEYS[7] = "NOVEL_4"; + COLUMN_KEYS[8] = "NOVEL_5"; + COLUMN_KEYS[9] = "NOVEL_6"; + COLUMN_KEYS[10] = "NOVEL_7"; + COLUMN_KEYS[11] = "NOVEL_8"; + COLUMN_KEYS[12] = "NOVEL_9"; + COLUMN_KEYS[13] = "NOVEL_10orMore"; + COLUMN_KEYS[14] = "RepeatExpansion_A"; + COLUMN_KEYS[15] = "RepeatExpansion_C"; + COLUMN_KEYS[16] = "RepeatExpansion_G"; + COLUMN_KEYS[17] = "RepeatExpansion_T"; + COLUMN_KEYS[18] = "RepeatExpansion_AC"; + COLUMN_KEYS[19] = "RepeatExpansion_AG"; + COLUMN_KEYS[20] = "RepeatExpansion_AT"; + COLUMN_KEYS[21] = "RepeatExpansion_CA"; + COLUMN_KEYS[22] = "RepeatExpansion_CG"; + COLUMN_KEYS[23] = "RepeatExpansion_CT"; + COLUMN_KEYS[24] = "RepeatExpansion_GA"; + COLUMN_KEYS[25] = "RepeatExpansion_GC"; + COLUMN_KEYS[26] = "RepeatExpansion_GT"; + COLUMN_KEYS[27] = "RepeatExpansion_TA"; + COLUMN_KEYS[28] = "RepeatExpansion_TC"; + COLUMN_KEYS[29] = "RepeatExpansion_TG"; + COLUMN_KEYS[30] = "RepeatExpansion_1"; + COLUMN_KEYS[31] = "RepeatExpansion_2"; + COLUMN_KEYS[32] = "RepeatExpansion_3"; + COLUMN_KEYS[33] = "RepeatExpansion_4"; + COLUMN_KEYS[34] = "RepeatExpansion_5"; + COLUMN_KEYS[35] = "RepeatExpansion_6"; + COLUMN_KEYS[36] = "RepeatExpansion_7"; + COLUMN_KEYS[37] = "RepeatExpansion_8"; + COLUMN_KEYS[38] = "RepeatExpansion_9"; + COLUMN_KEYS[39] = "RepeatExpansion_10orMore"; + COLUMN_KEYS[40] = "Other"; + + } + + private static final int START_IND_NOVEL = 4; + private static final int STOP_IND_NOVEL = 13; + private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29; + private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39; + private static final int IND_FOR_OTHER_EVENT = 40; + private static final int START_IND_NOVEL_PER_BASE = 0; + private static final int STOP_IND_NOVEL_PER_BASE = 3; + + + // map of sample to statistics + protected final HashMap indelClassSummary = new HashMap(); + + public IndelClasses(final VariantContext vc) { + indelClassSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]); + for( final String sample : vc.getGenotypes().keySet() ) { + indelClassSummary.put(sample, new int[COLUMN_KEYS.length]); + } + } + + /** + * + * @return one row per sample + */ + public Object[] getRowKeys() { + return indelClassSummary.keySet().toArray(new String[indelClassSummary.size()]); + } + public Object getCell(int x, int y) { + final Object[] rowKeys = getRowKeys(); + return String.format("%d",indelClassSummary.get(rowKeys[x])[y]); + } + + /** + * get the column keys + * @return a list of objects, in this case strings, that are the column names + */ + public Object[] getColumnKeys() { + return COLUMN_KEYS; + } + + public String getName() { + return "IndelClasses"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public String toString() { + return getName(); + } + + private void incrementSampleStat(VariantContext vc, int index) { + indelClassSummary.get(ALL_SAMPLES_KEY)[index]++; + for( final String sample : vc.getGenotypes().keySet() ) { + if ( indelClassSummary.containsKey(sample) ) { + Genotype g = vc.getGenotype(sample); + boolean isVariant = (g.isCalled() && !g.isHomRef()); + if (isVariant) + // update count + indelClassSummary.get(sample)[index]++; + + } + } + + } + /* + * increment the specified value + */ + public void incrValue(VariantContext vc, ReferenceContext ref) { + int eventLength = 0; + boolean isInsertion = false, isDeletion = false; + String indelAlleleString; + + if ( vc.isInsertion() ) { + eventLength = vc.getAlternateAllele(0).length(); + isInsertion = true; + indelAlleleString = vc.getAlternateAllele(0).getDisplayString(); + } else if ( vc.isDeletion() ) { + eventLength = vc.getReference().length(); + isDeletion = true; + indelAlleleString = vc.getReference().getDisplayString(); + } + else { + incrementSampleStat(vc, IND_FOR_OTHER_EVENT); + + return; + } + + byte[] refBases = ref.getBases(); + + + + // See first if indel is a repetition of bases before current + int indStart = refBases.length/2-eventLength; + + boolean done = false; + int numRepetitions = 0; + while (!done) { + if (indStart < 0) + done = true; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart - eventLength; + } + else + done = true; + + } + } + + // now do it forward + done = false; + indStart = refBases.length/2; + while (!done) { + if (indStart + eventLength >= refBases.length) + break; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart + eventLength; + } + else + done = true; + + } + } + + if (numRepetitions == 0) { + //unrepeated sequence from surroundings + int ind = START_IND_NOVEL + (eventLength-1); + if (ind > STOP_IND_NOVEL) + ind = STOP_IND_NOVEL; + incrementSampleStat(vc, ind); + + if (eventLength == 1) { + // log single base indels additionally by base + String keyStr = "Novel_" + indelAlleleString; + int k; + for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + // log now event + incrementSampleStat(vc, k); + } + } + else { + int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1); + if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS) + ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS; + incrementSampleStat(vc, ind); + + if (eventLength<=2) { + // for single or dinucleotide indels, we further log the base in which they occurred + String keyStr = "RepeatExpansion_" + indelAlleleString; + int k; + for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + // log now event + incrementSampleStat(vc, k); + } + + } + //g+ + /* + System.out.format("RefBefore: %s\n",new String(refBefore)); + System.out.format("RefAfter: %s\n",new String(refAfter)); + System.out.format("Indel Allele: %s\n", indelAlleleString); + System.out.format("Num Repetitions: %d\n", numRepetitions); + */ + } + + } + public IndelStatistics(VariantEvalWalker parent) { super(parent); // don't do anything @@ -233,11 +458,16 @@ public class IndelStatistics extends VariantEvaluator { if ( nSamples != -1 ) indelStats = new IndelStats(eval); } + if ( indelClasses == null ) { + indelClasses = new IndelClasses(eval); + } - if ( eval.isIndel() && - eval.isBiallelic() && - indelStats != null ) { - indelStats.incrValue(eval); + if ( eval.isIndel() && eval.isBiallelic() ) { + if (indelStats != null ) + indelStats.incrValue(eval); + + if (indelClasses != null) + indelClasses.incrValue(eval, ref); } }