diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java index e61540c32..962c333aa 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java @@ -59,7 +59,7 @@ public class IndelMetricsByAC extends VariantEvaluator { private static final int INDEL_SIZE_LIMIT = 100; private static final int NUM_SCALAR_COLUMNS = 6; static int len2Index(int ind) { - return ind+INDEL_SIZE_LIMIT+NUM_SCALAR_COLUMNS; + return ind+INDEL_SIZE_LIMIT; } static int index2len(int ind) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java index 97a5669a7..b58148021 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java @@ -8,8 +8,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.tags.DataPoint; +import org.broadinstitute.sting.utils.IndelUtils; import org.broadinstitute.sting.utils.report.utils.TableType; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -197,74 +199,16 @@ public class IndelStatistics extends VariantEvaluator { static class IndelClasses implements TableType { protected final static String ALL_SAMPLES_KEY = "allSamples"; - protected final static String[] COLUMN_KEYS; - - - - static { - COLUMN_KEYS= new String[41]; - COLUMN_KEYS[0] = "Novel_A"; - COLUMN_KEYS[1] = "Novel_C"; - COLUMN_KEYS[2] = "Novel_G"; - COLUMN_KEYS[3] = "Novel_T"; - COLUMN_KEYS[4] = "NOVEL_1"; - COLUMN_KEYS[5] = "NOVEL_2"; - COLUMN_KEYS[6] = "NOVEL_3"; - COLUMN_KEYS[7] = "NOVEL_4"; - COLUMN_KEYS[8] = "NOVEL_5"; - COLUMN_KEYS[9] = "NOVEL_6"; - COLUMN_KEYS[10] = "NOVEL_7"; - COLUMN_KEYS[11] = "NOVEL_8"; - COLUMN_KEYS[12] = "NOVEL_9"; - COLUMN_KEYS[13] = "NOVEL_10orMore"; - COLUMN_KEYS[14] = "RepeatExpansion_A"; - COLUMN_KEYS[15] = "RepeatExpansion_C"; - COLUMN_KEYS[16] = "RepeatExpansion_G"; - COLUMN_KEYS[17] = "RepeatExpansion_T"; - COLUMN_KEYS[18] = "RepeatExpansion_AC"; - COLUMN_KEYS[19] = "RepeatExpansion_AG"; - COLUMN_KEYS[20] = "RepeatExpansion_AT"; - COLUMN_KEYS[21] = "RepeatExpansion_CA"; - COLUMN_KEYS[22] = "RepeatExpansion_CG"; - COLUMN_KEYS[23] = "RepeatExpansion_CT"; - COLUMN_KEYS[24] = "RepeatExpansion_GA"; - COLUMN_KEYS[25] = "RepeatExpansion_GC"; - COLUMN_KEYS[26] = "RepeatExpansion_GT"; - COLUMN_KEYS[27] = "RepeatExpansion_TA"; - COLUMN_KEYS[28] = "RepeatExpansion_TC"; - COLUMN_KEYS[29] = "RepeatExpansion_TG"; - COLUMN_KEYS[30] = "RepeatExpansion_1"; - COLUMN_KEYS[31] = "RepeatExpansion_2"; - COLUMN_KEYS[32] = "RepeatExpansion_3"; - COLUMN_KEYS[33] = "RepeatExpansion_4"; - COLUMN_KEYS[34] = "RepeatExpansion_5"; - COLUMN_KEYS[35] = "RepeatExpansion_6"; - COLUMN_KEYS[36] = "RepeatExpansion_7"; - COLUMN_KEYS[37] = "RepeatExpansion_8"; - COLUMN_KEYS[38] = "RepeatExpansion_9"; - COLUMN_KEYS[39] = "RepeatExpansion_10orMore"; - COLUMN_KEYS[40] = "Other"; - - } - - private static final int START_IND_NOVEL = 4; - private static final int STOP_IND_NOVEL = 13; - private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14; - private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29; - private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30; - private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39; - private static final int IND_FOR_OTHER_EVENT = 40; - private static final int START_IND_NOVEL_PER_BASE = 0; - private static final int STOP_IND_NOVEL_PER_BASE = 3; + protected final static String[] columnNames = IndelUtils.getIndelClassificationNames(); // map of sample to statistics protected final HashMap indelClassSummary = new HashMap(); public IndelClasses(final VariantContext vc) { - indelClassSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]); + indelClassSummary.put(ALL_SAMPLES_KEY, new int[columnNames.length]); for( final String sample : vc.getGenotypes().keySet() ) { - indelClassSummary.put(sample, new int[COLUMN_KEYS.length]); + indelClassSummary.put(sample, new int[columnNames.length]); } } @@ -285,7 +229,7 @@ public class IndelStatistics extends VariantEvaluator { * @return a list of objects, in this case strings, that are the column names */ public Object[] getColumnKeys() { - return COLUMN_KEYS; + return columnNames; } public String getName() { @@ -317,138 +261,12 @@ public class IndelStatistics extends VariantEvaluator { /* * increment the specified value */ - private String findMinimalEvent(String eventString) { + public void incrValue(VariantContext vc, ReferenceContext ref) { - // for each length up to given string length, see if event string is a repetition of units of size N - boolean foundSubstring = false; - String minEvent = eventString; - for (int k=1; k < eventString.length(); k++) { - if (eventString.length() % k > 0) - continue; - String str = eventString.substring(0,k); - // now see if event string is a repetition of str - int numReps = eventString.length() / k; - String r = ""; - for (int j=0; j < numReps; j++) - r = r.concat(str); - if (r.matches(eventString)) { - foundSubstring = true; - minEvent = str; - break; - } - - } - return minEvent; - } - public void incrValue(VariantContext vc, ReferenceContext ref) { - int eventLength = 0; - boolean isInsertion = false, isDeletion = false; - String indelAlleleString; - - if ( vc.isInsertion() ) { - isInsertion = true; - indelAlleleString = vc.getAlternateAllele(0).getDisplayString(); - } else if ( vc.isDeletion() ) { - isDeletion = true; - indelAlleleString = vc.getReference().getDisplayString(); - } - else { - incrementSampleStat(vc, IND_FOR_OTHER_EVENT); - - return; - } - - byte[] refBases = ref.getBases(); - - indelAlleleString = findMinimalEvent(indelAlleleString); - eventLength = indelAlleleString.length(); - - // See first if indel is a repetition of bases before current - int indStart = refBases.length/2-eventLength+1; - - boolean done = false; - int numRepetitions = 0; - while (!done) { - if (indStart < 0) - done = true; - else { - String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); - if (refPiece.matches(indelAlleleString)) - { - numRepetitions++; - indStart = indStart - eventLength; - } - else - done = true; - - } - } - - // now do it forward - done = false; - indStart = refBases.length/2+1; - while (!done) { - if (indStart + eventLength >= refBases.length) - break; - else { - String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); - if (refPiece.matches(indelAlleleString)) - { - numRepetitions++; - indStart = indStart + eventLength; - } - else - done = true; - - } - } - - if (numRepetitions == 0) { - //unrepeated sequence from surroundings - int ind = START_IND_NOVEL + (eventLength-1); - if (ind > STOP_IND_NOVEL) - ind = STOP_IND_NOVEL; - incrementSampleStat(vc, ind); - - if (eventLength == 1) { - // log single base indels additionally by base - String keyStr = "Novel_" + indelAlleleString; - int k; - for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) { - if (keyStr.matches(COLUMN_KEYS[k])) - break; - } - // log now event - incrementSampleStat(vc, k); - } - } - else { - int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1); - if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS) - ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS; - incrementSampleStat(vc, ind); - - if (eventLength<=2) { - // for single or dinucleotide indels, we further log the base in which they occurred - String keyStr = "RepeatExpansion_" + indelAlleleString; - int k; - for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) { - if (keyStr.matches(COLUMN_KEYS[k])) - break; - } - // log now event - incrementSampleStat(vc, k); - } - - } - //g+ - /* - System.out.format("RefBefore: %s\n",new String(refBefore)); - System.out.format("RefAfter: %s\n",new String(refAfter)); - System.out.format("Indel Allele: %s\n", indelAlleleString); - System.out.format("Num Repetitions: %d\n", numRepetitions); - */ + ArrayList indices = IndelUtils.findEventClassificationIndex(vc,ref); + for (int index: indices) + incrementSampleStat(vc, index); } } diff --git a/java/src/org/broadinstitute/sting/utils/IndelUtils.java b/java/src/org/broadinstitute/sting/utils/IndelUtils.java new file mode 100755 index 000000000..bfbeaee55 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/IndelUtils.java @@ -0,0 +1,237 @@ +package org.broadinstitute.sting.utils; + +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: Feb 3, 2011 + * Time: 2:44:22 PM + * To change this template use File | Settings | File Templates. + */ +public class IndelUtils { + protected final static String[] COLUMN_KEYS; + + + + static { + COLUMN_KEYS= new String[41]; + COLUMN_KEYS[0] = "Novel_A"; + COLUMN_KEYS[1] = "Novel_C"; + COLUMN_KEYS[2] = "Novel_G"; + COLUMN_KEYS[3] = "Novel_T"; + COLUMN_KEYS[4] = "NOVEL_1"; + COLUMN_KEYS[5] = "NOVEL_2"; + COLUMN_KEYS[6] = "NOVEL_3"; + COLUMN_KEYS[7] = "NOVEL_4"; + COLUMN_KEYS[8] = "NOVEL_5"; + COLUMN_KEYS[9] = "NOVEL_6"; + COLUMN_KEYS[10] = "NOVEL_7"; + COLUMN_KEYS[11] = "NOVEL_8"; + COLUMN_KEYS[12] = "NOVEL_9"; + COLUMN_KEYS[13] = "NOVEL_10orMore"; + COLUMN_KEYS[14] = "RepeatExpansion_A"; + COLUMN_KEYS[15] = "RepeatExpansion_C"; + COLUMN_KEYS[16] = "RepeatExpansion_G"; + COLUMN_KEYS[17] = "RepeatExpansion_T"; + COLUMN_KEYS[18] = "RepeatExpansion_AC"; + COLUMN_KEYS[19] = "RepeatExpansion_AG"; + COLUMN_KEYS[20] = "RepeatExpansion_AT"; + COLUMN_KEYS[21] = "RepeatExpansion_CA"; + COLUMN_KEYS[22] = "RepeatExpansion_CG"; + COLUMN_KEYS[23] = "RepeatExpansion_CT"; + COLUMN_KEYS[24] = "RepeatExpansion_GA"; + COLUMN_KEYS[25] = "RepeatExpansion_GC"; + COLUMN_KEYS[26] = "RepeatExpansion_GT"; + COLUMN_KEYS[27] = "RepeatExpansion_TA"; + COLUMN_KEYS[28] = "RepeatExpansion_TC"; + COLUMN_KEYS[29] = "RepeatExpansion_TG"; + COLUMN_KEYS[30] = "RepeatExpansion_1"; + COLUMN_KEYS[31] = "RepeatExpansion_2"; + COLUMN_KEYS[32] = "RepeatExpansion_3"; + COLUMN_KEYS[33] = "RepeatExpansion_4"; + COLUMN_KEYS[34] = "RepeatExpansion_5"; + COLUMN_KEYS[35] = "RepeatExpansion_6"; + COLUMN_KEYS[36] = "RepeatExpansion_7"; + COLUMN_KEYS[37] = "RepeatExpansion_8"; + COLUMN_KEYS[38] = "RepeatExpansion_9"; + COLUMN_KEYS[39] = "RepeatExpansion_10orMore"; + COLUMN_KEYS[40] = "Other"; + + } + + private static final int START_IND_NOVEL = 4; + private static final int STOP_IND_NOVEL = 13; + private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14; + private static final int IND_FOR_REPEAT_EXPANSION_A = 14; + private static final int IND_FOR_REPEAT_EXPANSION_C = 15; + private static final int IND_FOR_REPEAT_EXPANSION_G = 16; + private static final int IND_FOR_REPEAT_EXPANSION_T = 17; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29; + private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39; + private static final int IND_FOR_OTHER_EVENT = 40; + private static final int START_IND_NOVEL_PER_BASE = 0; + private static final int STOP_IND_NOVEL_PER_BASE = 3; + + private static String findMinimalEvent(String eventString) { + + // for each length up to given string length, see if event string is a repetition of units of size N + String minEvent = eventString; + for (int k=1; k < eventString.length(); k++) { + if (eventString.length() % k > 0) + continue; + String str = eventString.substring(0,k); + // now see if event string is a repetition of str + int numReps = eventString.length() / k; + String r = ""; + for (int j=0; j < numReps; j++) + r = r.concat(str); + + if (r.matches(eventString)) { + minEvent = str; + break; + } + + } + return minEvent; + } + + public static ArrayList findEventClassificationIndex(VariantContext vc, ReferenceContext ref) { + int eventLength; + + String indelAlleleString; + boolean done = false; + + ArrayList inds = new ArrayList(); + if ( vc.isInsertion() ) { + indelAlleleString = vc.getAlternateAllele(0).getDisplayString(); + } else if ( vc.isDeletion() ) { + indelAlleleString = vc.getReference().getDisplayString(); + } + else { + inds.add(IND_FOR_OTHER_EVENT); + return inds; + } + + byte[] refBases = ref.getBases(); + + indelAlleleString = findMinimalEvent(indelAlleleString); + eventLength = indelAlleleString.length(); + + // See first if indel is a repetition of bases before current + int indStart = refBases.length/2-eventLength+1; + + int numRepetitions = 0; + while (!done) { + if (indStart < 0) + done = true; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart - eventLength; + } + else + done = true; + + } + } + + // now do it forward + done = false; + indStart = refBases.length/2+1; + while (!done) { + if (indStart + eventLength >= refBases.length) + break; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart + eventLength; + } + else + done = true; + + } + } + + if (numRepetitions == 0) { + //unrepeated sequence from surroundings + int ind = START_IND_NOVEL + (eventLength-1); + if (ind > STOP_IND_NOVEL) + ind = STOP_IND_NOVEL; + inds.add(ind); + + if (eventLength == 1) { + // log single base indels additionally by base + String keyStr = "Novel_" + indelAlleleString; + int k; + for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + inds.add(k); + } + } + else { + int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1); + if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS) + ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS; + inds.add(ind); + + if (eventLength<=2) { + // for single or dinucleotide indels, we further log the base in which they occurred + String keyStr = "RepeatExpansion_" + indelAlleleString; + int k; + for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + // log now event + inds.add(k); + } + + } + + return inds; + } + + public static String[] getIndelClassificationNames() { + return COLUMN_KEYS; + } + + public static String getIndelClassificationName(int k) { + if (k >=0 && k < COLUMN_KEYS.length) + return COLUMN_KEYS[k]; + else + throw new ReviewedStingException("Invalid index when trying to get indel classification name"); + } + + public static boolean isATExpansion(VariantContext vc, ReferenceContext ref) { + ArrayList inds = findEventClassificationIndex(vc, ref); + + boolean isIt = false; + for (int k : inds) { + if (k == IND_FOR_REPEAT_EXPANSION_A || k == IND_FOR_REPEAT_EXPANSION_A) { + isIt = true; + break; + } + } + + return isIt; + + } + public static boolean isCGExpansion(VariantContext vc, ReferenceContext ref) { + return !isATExpansion(vc,ref); + + } + +}