From f3de9ee3e0b81f5f94eaaf74883df05e53d40be4 Mon Sep 17 00:00:00 2001 From: delangel Date: Tue, 8 Feb 2011 17:35:16 +0000 Subject: [PATCH] Refactoring of indel evaluation code to make it easier for external functions to get access to indel classification, in preparation for IndelMetricsByAC to stratify indel classes by AC (not done yet). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5219 348d0f76-0448-11de-a6fe-93d51630548a --- .../evaluators/IndelMetricsByAC.java | 2 +- .../evaluators/IndelStatistics.java | 202 +-------------- .../sting/utils/IndelUtils.java | 237 ++++++++++++++++++ 3 files changed, 248 insertions(+), 193 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/utils/IndelUtils.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java index e61540c32..962c333aa 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java @@ -59,7 +59,7 @@ public class IndelMetricsByAC extends VariantEvaluator { private static final int INDEL_SIZE_LIMIT = 100; private static final int NUM_SCALAR_COLUMNS = 6; static int len2Index(int ind) { - return ind+INDEL_SIZE_LIMIT+NUM_SCALAR_COLUMNS; + return ind+INDEL_SIZE_LIMIT; } static int index2len(int ind) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java index 97a5669a7..b58148021 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java @@ -8,8 +8,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.tags.DataPoint; +import org.broadinstitute.sting.utils.IndelUtils; import org.broadinstitute.sting.utils.report.utils.TableType; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -197,74 +199,16 @@ public class IndelStatistics extends VariantEvaluator { static class IndelClasses implements TableType { protected final static String ALL_SAMPLES_KEY = "allSamples"; - protected final static String[] COLUMN_KEYS; - - - - static { - COLUMN_KEYS= new String[41]; - COLUMN_KEYS[0] = "Novel_A"; - COLUMN_KEYS[1] = "Novel_C"; - COLUMN_KEYS[2] = "Novel_G"; - COLUMN_KEYS[3] = "Novel_T"; - COLUMN_KEYS[4] = "NOVEL_1"; - COLUMN_KEYS[5] = "NOVEL_2"; - COLUMN_KEYS[6] = "NOVEL_3"; - COLUMN_KEYS[7] = "NOVEL_4"; - COLUMN_KEYS[8] = "NOVEL_5"; - COLUMN_KEYS[9] = "NOVEL_6"; - COLUMN_KEYS[10] = "NOVEL_7"; - COLUMN_KEYS[11] = "NOVEL_8"; - COLUMN_KEYS[12] = "NOVEL_9"; - COLUMN_KEYS[13] = "NOVEL_10orMore"; - COLUMN_KEYS[14] = "RepeatExpansion_A"; - COLUMN_KEYS[15] = "RepeatExpansion_C"; - COLUMN_KEYS[16] = "RepeatExpansion_G"; - COLUMN_KEYS[17] = "RepeatExpansion_T"; - COLUMN_KEYS[18] = "RepeatExpansion_AC"; - COLUMN_KEYS[19] = "RepeatExpansion_AG"; - COLUMN_KEYS[20] = "RepeatExpansion_AT"; - COLUMN_KEYS[21] = "RepeatExpansion_CA"; - COLUMN_KEYS[22] = "RepeatExpansion_CG"; - COLUMN_KEYS[23] = "RepeatExpansion_CT"; - COLUMN_KEYS[24] = "RepeatExpansion_GA"; - COLUMN_KEYS[25] = "RepeatExpansion_GC"; - COLUMN_KEYS[26] = "RepeatExpansion_GT"; - COLUMN_KEYS[27] = "RepeatExpansion_TA"; - COLUMN_KEYS[28] = "RepeatExpansion_TC"; - COLUMN_KEYS[29] = "RepeatExpansion_TG"; - COLUMN_KEYS[30] = "RepeatExpansion_1"; - COLUMN_KEYS[31] = "RepeatExpansion_2"; - COLUMN_KEYS[32] = "RepeatExpansion_3"; - COLUMN_KEYS[33] = "RepeatExpansion_4"; - COLUMN_KEYS[34] = "RepeatExpansion_5"; - COLUMN_KEYS[35] = "RepeatExpansion_6"; - COLUMN_KEYS[36] = "RepeatExpansion_7"; - COLUMN_KEYS[37] = "RepeatExpansion_8"; - COLUMN_KEYS[38] = "RepeatExpansion_9"; - COLUMN_KEYS[39] = "RepeatExpansion_10orMore"; - COLUMN_KEYS[40] = "Other"; - - } - - private static final int START_IND_NOVEL = 4; - private static final int STOP_IND_NOVEL = 13; - private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14; - private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29; - private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30; - private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39; - private static final int IND_FOR_OTHER_EVENT = 40; - private static final int START_IND_NOVEL_PER_BASE = 0; - private static final int STOP_IND_NOVEL_PER_BASE = 3; + protected final static String[] columnNames = IndelUtils.getIndelClassificationNames(); // map of sample to statistics protected final HashMap indelClassSummary = new HashMap(); public IndelClasses(final VariantContext vc) { - indelClassSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]); + indelClassSummary.put(ALL_SAMPLES_KEY, new int[columnNames.length]); for( final String sample : vc.getGenotypes().keySet() ) { - indelClassSummary.put(sample, new int[COLUMN_KEYS.length]); + indelClassSummary.put(sample, new int[columnNames.length]); } } @@ -285,7 +229,7 @@ public class IndelStatistics extends VariantEvaluator { * @return a list of objects, in this case strings, that are the column names */ public Object[] getColumnKeys() { - return COLUMN_KEYS; + return columnNames; } public String getName() { @@ -317,138 +261,12 @@ public class IndelStatistics extends VariantEvaluator { /* * increment the specified value */ - private String findMinimalEvent(String eventString) { + public void incrValue(VariantContext vc, ReferenceContext ref) { - // for each length up to given string length, see if event string is a repetition of units of size N - boolean foundSubstring = false; - String minEvent = eventString; - for (int k=1; k < eventString.length(); k++) { - if (eventString.length() % k > 0) - continue; - String str = eventString.substring(0,k); - // now see if event string is a repetition of str - int numReps = eventString.length() / k; - String r = ""; - for (int j=0; j < numReps; j++) - r = r.concat(str); - if (r.matches(eventString)) { - foundSubstring = true; - minEvent = str; - break; - } - - } - return minEvent; - } - public void incrValue(VariantContext vc, ReferenceContext ref) { - int eventLength = 0; - boolean isInsertion = false, isDeletion = false; - String indelAlleleString; - - if ( vc.isInsertion() ) { - isInsertion = true; - indelAlleleString = vc.getAlternateAllele(0).getDisplayString(); - } else if ( vc.isDeletion() ) { - isDeletion = true; - indelAlleleString = vc.getReference().getDisplayString(); - } - else { - incrementSampleStat(vc, IND_FOR_OTHER_EVENT); - - return; - } - - byte[] refBases = ref.getBases(); - - indelAlleleString = findMinimalEvent(indelAlleleString); - eventLength = indelAlleleString.length(); - - // See first if indel is a repetition of bases before current - int indStart = refBases.length/2-eventLength+1; - - boolean done = false; - int numRepetitions = 0; - while (!done) { - if (indStart < 0) - done = true; - else { - String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); - if (refPiece.matches(indelAlleleString)) - { - numRepetitions++; - indStart = indStart - eventLength; - } - else - done = true; - - } - } - - // now do it forward - done = false; - indStart = refBases.length/2+1; - while (!done) { - if (indStart + eventLength >= refBases.length) - break; - else { - String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); - if (refPiece.matches(indelAlleleString)) - { - numRepetitions++; - indStart = indStart + eventLength; - } - else - done = true; - - } - } - - if (numRepetitions == 0) { - //unrepeated sequence from surroundings - int ind = START_IND_NOVEL + (eventLength-1); - if (ind > STOP_IND_NOVEL) - ind = STOP_IND_NOVEL; - incrementSampleStat(vc, ind); - - if (eventLength == 1) { - // log single base indels additionally by base - String keyStr = "Novel_" + indelAlleleString; - int k; - for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) { - if (keyStr.matches(COLUMN_KEYS[k])) - break; - } - // log now event - incrementSampleStat(vc, k); - } - } - else { - int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1); - if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS) - ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS; - incrementSampleStat(vc, ind); - - if (eventLength<=2) { - // for single or dinucleotide indels, we further log the base in which they occurred - String keyStr = "RepeatExpansion_" + indelAlleleString; - int k; - for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) { - if (keyStr.matches(COLUMN_KEYS[k])) - break; - } - // log now event - incrementSampleStat(vc, k); - } - - } - //g+ - /* - System.out.format("RefBefore: %s\n",new String(refBefore)); - System.out.format("RefAfter: %s\n",new String(refAfter)); - System.out.format("Indel Allele: %s\n", indelAlleleString); - System.out.format("Num Repetitions: %d\n", numRepetitions); - */ + ArrayList indices = IndelUtils.findEventClassificationIndex(vc,ref); + for (int index: indices) + incrementSampleStat(vc, index); } } diff --git a/java/src/org/broadinstitute/sting/utils/IndelUtils.java b/java/src/org/broadinstitute/sting/utils/IndelUtils.java new file mode 100755 index 000000000..bfbeaee55 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/IndelUtils.java @@ -0,0 +1,237 @@ +package org.broadinstitute.sting.utils; + +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: Feb 3, 2011 + * Time: 2:44:22 PM + * To change this template use File | Settings | File Templates. + */ +public class IndelUtils { + protected final static String[] COLUMN_KEYS; + + + + static { + COLUMN_KEYS= new String[41]; + COLUMN_KEYS[0] = "Novel_A"; + COLUMN_KEYS[1] = "Novel_C"; + COLUMN_KEYS[2] = "Novel_G"; + COLUMN_KEYS[3] = "Novel_T"; + COLUMN_KEYS[4] = "NOVEL_1"; + COLUMN_KEYS[5] = "NOVEL_2"; + COLUMN_KEYS[6] = "NOVEL_3"; + COLUMN_KEYS[7] = "NOVEL_4"; + COLUMN_KEYS[8] = "NOVEL_5"; + COLUMN_KEYS[9] = "NOVEL_6"; + COLUMN_KEYS[10] = "NOVEL_7"; + COLUMN_KEYS[11] = "NOVEL_8"; + COLUMN_KEYS[12] = "NOVEL_9"; + COLUMN_KEYS[13] = "NOVEL_10orMore"; + COLUMN_KEYS[14] = "RepeatExpansion_A"; + COLUMN_KEYS[15] = "RepeatExpansion_C"; + COLUMN_KEYS[16] = "RepeatExpansion_G"; + COLUMN_KEYS[17] = "RepeatExpansion_T"; + COLUMN_KEYS[18] = "RepeatExpansion_AC"; + COLUMN_KEYS[19] = "RepeatExpansion_AG"; + COLUMN_KEYS[20] = "RepeatExpansion_AT"; + COLUMN_KEYS[21] = "RepeatExpansion_CA"; + COLUMN_KEYS[22] = "RepeatExpansion_CG"; + COLUMN_KEYS[23] = "RepeatExpansion_CT"; + COLUMN_KEYS[24] = "RepeatExpansion_GA"; + COLUMN_KEYS[25] = "RepeatExpansion_GC"; + COLUMN_KEYS[26] = "RepeatExpansion_GT"; + COLUMN_KEYS[27] = "RepeatExpansion_TA"; + COLUMN_KEYS[28] = "RepeatExpansion_TC"; + COLUMN_KEYS[29] = "RepeatExpansion_TG"; + COLUMN_KEYS[30] = "RepeatExpansion_1"; + COLUMN_KEYS[31] = "RepeatExpansion_2"; + COLUMN_KEYS[32] = "RepeatExpansion_3"; + COLUMN_KEYS[33] = "RepeatExpansion_4"; + COLUMN_KEYS[34] = "RepeatExpansion_5"; + COLUMN_KEYS[35] = "RepeatExpansion_6"; + COLUMN_KEYS[36] = "RepeatExpansion_7"; + COLUMN_KEYS[37] = "RepeatExpansion_8"; + COLUMN_KEYS[38] = "RepeatExpansion_9"; + COLUMN_KEYS[39] = "RepeatExpansion_10orMore"; + COLUMN_KEYS[40] = "Other"; + + } + + private static final int START_IND_NOVEL = 4; + private static final int STOP_IND_NOVEL = 13; + private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14; + private static final int IND_FOR_REPEAT_EXPANSION_A = 14; + private static final int IND_FOR_REPEAT_EXPANSION_C = 15; + private static final int IND_FOR_REPEAT_EXPANSION_G = 16; + private static final int IND_FOR_REPEAT_EXPANSION_T = 17; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29; + private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39; + private static final int IND_FOR_OTHER_EVENT = 40; + private static final int START_IND_NOVEL_PER_BASE = 0; + private static final int STOP_IND_NOVEL_PER_BASE = 3; + + private static String findMinimalEvent(String eventString) { + + // for each length up to given string length, see if event string is a repetition of units of size N + String minEvent = eventString; + for (int k=1; k < eventString.length(); k++) { + if (eventString.length() % k > 0) + continue; + String str = eventString.substring(0,k); + // now see if event string is a repetition of str + int numReps = eventString.length() / k; + String r = ""; + for (int j=0; j < numReps; j++) + r = r.concat(str); + + if (r.matches(eventString)) { + minEvent = str; + break; + } + + } + return minEvent; + } + + public static ArrayList findEventClassificationIndex(VariantContext vc, ReferenceContext ref) { + int eventLength; + + String indelAlleleString; + boolean done = false; + + ArrayList inds = new ArrayList(); + if ( vc.isInsertion() ) { + indelAlleleString = vc.getAlternateAllele(0).getDisplayString(); + } else if ( vc.isDeletion() ) { + indelAlleleString = vc.getReference().getDisplayString(); + } + else { + inds.add(IND_FOR_OTHER_EVENT); + return inds; + } + + byte[] refBases = ref.getBases(); + + indelAlleleString = findMinimalEvent(indelAlleleString); + eventLength = indelAlleleString.length(); + + // See first if indel is a repetition of bases before current + int indStart = refBases.length/2-eventLength+1; + + int numRepetitions = 0; + while (!done) { + if (indStart < 0) + done = true; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart - eventLength; + } + else + done = true; + + } + } + + // now do it forward + done = false; + indStart = refBases.length/2+1; + while (!done) { + if (indStart + eventLength >= refBases.length) + break; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart + eventLength; + } + else + done = true; + + } + } + + if (numRepetitions == 0) { + //unrepeated sequence from surroundings + int ind = START_IND_NOVEL + (eventLength-1); + if (ind > STOP_IND_NOVEL) + ind = STOP_IND_NOVEL; + inds.add(ind); + + if (eventLength == 1) { + // log single base indels additionally by base + String keyStr = "Novel_" + indelAlleleString; + int k; + for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + inds.add(k); + } + } + else { + int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1); + if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS) + ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS; + inds.add(ind); + + if (eventLength<=2) { + // for single or dinucleotide indels, we further log the base in which they occurred + String keyStr = "RepeatExpansion_" + indelAlleleString; + int k; + for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + // log now event + inds.add(k); + } + + } + + return inds; + } + + public static String[] getIndelClassificationNames() { + return COLUMN_KEYS; + } + + public static String getIndelClassificationName(int k) { + if (k >=0 && k < COLUMN_KEYS.length) + return COLUMN_KEYS[k]; + else + throw new ReviewedStingException("Invalid index when trying to get indel classification name"); + } + + public static boolean isATExpansion(VariantContext vc, ReferenceContext ref) { + ArrayList inds = findEventClassificationIndex(vc, ref); + + boolean isIt = false; + for (int k : inds) { + if (k == IND_FOR_REPEAT_EXPANSION_A || k == IND_FOR_REPEAT_EXPANSION_A) { + isIt = true; + break; + } + } + + return isIt; + + } + public static boolean isCGExpansion(VariantContext vc, ReferenceContext ref) { + return !isATExpansion(vc,ref); + + } + +}