Refactoring of indel evaluation code to make it easier for external functions to get access to indel classification, in preparation for IndelMetricsByAC to stratify indel classes by AC (not done yet).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5219 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3635606cd8
commit
f3de9ee3e0
|
|
@ -59,7 +59,7 @@ public class IndelMetricsByAC extends VariantEvaluator {
|
|||
private static final int INDEL_SIZE_LIMIT = 100;
|
||||
private static final int NUM_SCALAR_COLUMNS = 6;
|
||||
static int len2Index(int ind) {
|
||||
return ind+INDEL_SIZE_LIMIT+NUM_SCALAR_COLUMNS;
|
||||
return ind+INDEL_SIZE_LIMIT;
|
||||
}
|
||||
|
||||
static int index2len(int ind) {
|
||||
|
|
|
|||
|
|
@ -8,8 +8,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.DataPoint;
|
||||
import org.broadinstitute.sting.utils.IndelUtils;
|
||||
import org.broadinstitute.sting.utils.report.utils.TableType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
||||
|
|
@ -197,74 +199,16 @@ public class IndelStatistics extends VariantEvaluator {
|
|||
|
||||
static class IndelClasses implements TableType {
|
||||
protected final static String ALL_SAMPLES_KEY = "allSamples";
|
||||
protected final static String[] COLUMN_KEYS;
|
||||
|
||||
|
||||
|
||||
static {
|
||||
COLUMN_KEYS= new String[41];
|
||||
COLUMN_KEYS[0] = "Novel_A";
|
||||
COLUMN_KEYS[1] = "Novel_C";
|
||||
COLUMN_KEYS[2] = "Novel_G";
|
||||
COLUMN_KEYS[3] = "Novel_T";
|
||||
COLUMN_KEYS[4] = "NOVEL_1";
|
||||
COLUMN_KEYS[5] = "NOVEL_2";
|
||||
COLUMN_KEYS[6] = "NOVEL_3";
|
||||
COLUMN_KEYS[7] = "NOVEL_4";
|
||||
COLUMN_KEYS[8] = "NOVEL_5";
|
||||
COLUMN_KEYS[9] = "NOVEL_6";
|
||||
COLUMN_KEYS[10] = "NOVEL_7";
|
||||
COLUMN_KEYS[11] = "NOVEL_8";
|
||||
COLUMN_KEYS[12] = "NOVEL_9";
|
||||
COLUMN_KEYS[13] = "NOVEL_10orMore";
|
||||
COLUMN_KEYS[14] = "RepeatExpansion_A";
|
||||
COLUMN_KEYS[15] = "RepeatExpansion_C";
|
||||
COLUMN_KEYS[16] = "RepeatExpansion_G";
|
||||
COLUMN_KEYS[17] = "RepeatExpansion_T";
|
||||
COLUMN_KEYS[18] = "RepeatExpansion_AC";
|
||||
COLUMN_KEYS[19] = "RepeatExpansion_AG";
|
||||
COLUMN_KEYS[20] = "RepeatExpansion_AT";
|
||||
COLUMN_KEYS[21] = "RepeatExpansion_CA";
|
||||
COLUMN_KEYS[22] = "RepeatExpansion_CG";
|
||||
COLUMN_KEYS[23] = "RepeatExpansion_CT";
|
||||
COLUMN_KEYS[24] = "RepeatExpansion_GA";
|
||||
COLUMN_KEYS[25] = "RepeatExpansion_GC";
|
||||
COLUMN_KEYS[26] = "RepeatExpansion_GT";
|
||||
COLUMN_KEYS[27] = "RepeatExpansion_TA";
|
||||
COLUMN_KEYS[28] = "RepeatExpansion_TC";
|
||||
COLUMN_KEYS[29] = "RepeatExpansion_TG";
|
||||
COLUMN_KEYS[30] = "RepeatExpansion_1";
|
||||
COLUMN_KEYS[31] = "RepeatExpansion_2";
|
||||
COLUMN_KEYS[32] = "RepeatExpansion_3";
|
||||
COLUMN_KEYS[33] = "RepeatExpansion_4";
|
||||
COLUMN_KEYS[34] = "RepeatExpansion_5";
|
||||
COLUMN_KEYS[35] = "RepeatExpansion_6";
|
||||
COLUMN_KEYS[36] = "RepeatExpansion_7";
|
||||
COLUMN_KEYS[37] = "RepeatExpansion_8";
|
||||
COLUMN_KEYS[38] = "RepeatExpansion_9";
|
||||
COLUMN_KEYS[39] = "RepeatExpansion_10orMore";
|
||||
COLUMN_KEYS[40] = "Other";
|
||||
|
||||
}
|
||||
|
||||
private static final int START_IND_NOVEL = 4;
|
||||
private static final int STOP_IND_NOVEL = 13;
|
||||
private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14;
|
||||
private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29;
|
||||
private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30;
|
||||
private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39;
|
||||
private static final int IND_FOR_OTHER_EVENT = 40;
|
||||
private static final int START_IND_NOVEL_PER_BASE = 0;
|
||||
private static final int STOP_IND_NOVEL_PER_BASE = 3;
|
||||
protected final static String[] columnNames = IndelUtils.getIndelClassificationNames();
|
||||
|
||||
|
||||
// map of sample to statistics
|
||||
protected final HashMap<String, int[]> indelClassSummary = new HashMap<String, int[]>();
|
||||
|
||||
public IndelClasses(final VariantContext vc) {
|
||||
indelClassSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]);
|
||||
indelClassSummary.put(ALL_SAMPLES_KEY, new int[columnNames.length]);
|
||||
for( final String sample : vc.getGenotypes().keySet() ) {
|
||||
indelClassSummary.put(sample, new int[COLUMN_KEYS.length]);
|
||||
indelClassSummary.put(sample, new int[columnNames.length]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -285,7 +229,7 @@ public class IndelStatistics extends VariantEvaluator {
|
|||
* @return a list of objects, in this case strings, that are the column names
|
||||
*/
|
||||
public Object[] getColumnKeys() {
|
||||
return COLUMN_KEYS;
|
||||
return columnNames;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
|
|
@ -317,138 +261,12 @@ public class IndelStatistics extends VariantEvaluator {
|
|||
/*
|
||||
* increment the specified value
|
||||
*/
|
||||
private String findMinimalEvent(String eventString) {
|
||||
public void incrValue(VariantContext vc, ReferenceContext ref) {
|
||||
|
||||
// for each length up to given string length, see if event string is a repetition of units of size N
|
||||
boolean foundSubstring = false;
|
||||
String minEvent = eventString;
|
||||
for (int k=1; k < eventString.length(); k++) {
|
||||
if (eventString.length() % k > 0)
|
||||
continue;
|
||||
String str = eventString.substring(0,k);
|
||||
// now see if event string is a repetition of str
|
||||
int numReps = eventString.length() / k;
|
||||
String r = "";
|
||||
for (int j=0; j < numReps; j++)
|
||||
r = r.concat(str);
|
||||
|
||||
if (r.matches(eventString)) {
|
||||
foundSubstring = true;
|
||||
minEvent = str;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
return minEvent;
|
||||
}
|
||||
public void incrValue(VariantContext vc, ReferenceContext ref) {
|
||||
int eventLength = 0;
|
||||
boolean isInsertion = false, isDeletion = false;
|
||||
String indelAlleleString;
|
||||
|
||||
if ( vc.isInsertion() ) {
|
||||
isInsertion = true;
|
||||
indelAlleleString = vc.getAlternateAllele(0).getDisplayString();
|
||||
} else if ( vc.isDeletion() ) {
|
||||
isDeletion = true;
|
||||
indelAlleleString = vc.getReference().getDisplayString();
|
||||
}
|
||||
else {
|
||||
incrementSampleStat(vc, IND_FOR_OTHER_EVENT);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
byte[] refBases = ref.getBases();
|
||||
|
||||
indelAlleleString = findMinimalEvent(indelAlleleString);
|
||||
eventLength = indelAlleleString.length();
|
||||
|
||||
// See first if indel is a repetition of bases before current
|
||||
int indStart = refBases.length/2-eventLength+1;
|
||||
|
||||
boolean done = false;
|
||||
int numRepetitions = 0;
|
||||
while (!done) {
|
||||
if (indStart < 0)
|
||||
done = true;
|
||||
else {
|
||||
String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength));
|
||||
if (refPiece.matches(indelAlleleString))
|
||||
{
|
||||
numRepetitions++;
|
||||
indStart = indStart - eventLength;
|
||||
}
|
||||
else
|
||||
done = true;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// now do it forward
|
||||
done = false;
|
||||
indStart = refBases.length/2+1;
|
||||
while (!done) {
|
||||
if (indStart + eventLength >= refBases.length)
|
||||
break;
|
||||
else {
|
||||
String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength));
|
||||
if (refPiece.matches(indelAlleleString))
|
||||
{
|
||||
numRepetitions++;
|
||||
indStart = indStart + eventLength;
|
||||
}
|
||||
else
|
||||
done = true;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (numRepetitions == 0) {
|
||||
//unrepeated sequence from surroundings
|
||||
int ind = START_IND_NOVEL + (eventLength-1);
|
||||
if (ind > STOP_IND_NOVEL)
|
||||
ind = STOP_IND_NOVEL;
|
||||
incrementSampleStat(vc, ind);
|
||||
|
||||
if (eventLength == 1) {
|
||||
// log single base indels additionally by base
|
||||
String keyStr = "Novel_" + indelAlleleString;
|
||||
int k;
|
||||
for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) {
|
||||
if (keyStr.matches(COLUMN_KEYS[k]))
|
||||
break;
|
||||
}
|
||||
// log now event
|
||||
incrementSampleStat(vc, k);
|
||||
}
|
||||
}
|
||||
else {
|
||||
int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1);
|
||||
if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS)
|
||||
ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS;
|
||||
incrementSampleStat(vc, ind);
|
||||
|
||||
if (eventLength<=2) {
|
||||
// for single or dinucleotide indels, we further log the base in which they occurred
|
||||
String keyStr = "RepeatExpansion_" + indelAlleleString;
|
||||
int k;
|
||||
for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) {
|
||||
if (keyStr.matches(COLUMN_KEYS[k]))
|
||||
break;
|
||||
}
|
||||
// log now event
|
||||
incrementSampleStat(vc, k);
|
||||
}
|
||||
|
||||
}
|
||||
//g+
|
||||
/*
|
||||
System.out.format("RefBefore: %s\n",new String(refBefore));
|
||||
System.out.format("RefAfter: %s\n",new String(refAfter));
|
||||
System.out.format("Indel Allele: %s\n", indelAlleleString);
|
||||
System.out.format("Num Repetitions: %d\n", numRepetitions);
|
||||
*/
|
||||
ArrayList<Integer> indices = IndelUtils.findEventClassificationIndex(vc,ref);
|
||||
for (int index: indices)
|
||||
incrementSampleStat(vc, index);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,237 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: Feb 3, 2011
|
||||
* Time: 2:44:22 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class IndelUtils {
|
||||
protected final static String[] COLUMN_KEYS;
|
||||
|
||||
|
||||
|
||||
static {
|
||||
COLUMN_KEYS= new String[41];
|
||||
COLUMN_KEYS[0] = "Novel_A";
|
||||
COLUMN_KEYS[1] = "Novel_C";
|
||||
COLUMN_KEYS[2] = "Novel_G";
|
||||
COLUMN_KEYS[3] = "Novel_T";
|
||||
COLUMN_KEYS[4] = "NOVEL_1";
|
||||
COLUMN_KEYS[5] = "NOVEL_2";
|
||||
COLUMN_KEYS[6] = "NOVEL_3";
|
||||
COLUMN_KEYS[7] = "NOVEL_4";
|
||||
COLUMN_KEYS[8] = "NOVEL_5";
|
||||
COLUMN_KEYS[9] = "NOVEL_6";
|
||||
COLUMN_KEYS[10] = "NOVEL_7";
|
||||
COLUMN_KEYS[11] = "NOVEL_8";
|
||||
COLUMN_KEYS[12] = "NOVEL_9";
|
||||
COLUMN_KEYS[13] = "NOVEL_10orMore";
|
||||
COLUMN_KEYS[14] = "RepeatExpansion_A";
|
||||
COLUMN_KEYS[15] = "RepeatExpansion_C";
|
||||
COLUMN_KEYS[16] = "RepeatExpansion_G";
|
||||
COLUMN_KEYS[17] = "RepeatExpansion_T";
|
||||
COLUMN_KEYS[18] = "RepeatExpansion_AC";
|
||||
COLUMN_KEYS[19] = "RepeatExpansion_AG";
|
||||
COLUMN_KEYS[20] = "RepeatExpansion_AT";
|
||||
COLUMN_KEYS[21] = "RepeatExpansion_CA";
|
||||
COLUMN_KEYS[22] = "RepeatExpansion_CG";
|
||||
COLUMN_KEYS[23] = "RepeatExpansion_CT";
|
||||
COLUMN_KEYS[24] = "RepeatExpansion_GA";
|
||||
COLUMN_KEYS[25] = "RepeatExpansion_GC";
|
||||
COLUMN_KEYS[26] = "RepeatExpansion_GT";
|
||||
COLUMN_KEYS[27] = "RepeatExpansion_TA";
|
||||
COLUMN_KEYS[28] = "RepeatExpansion_TC";
|
||||
COLUMN_KEYS[29] = "RepeatExpansion_TG";
|
||||
COLUMN_KEYS[30] = "RepeatExpansion_1";
|
||||
COLUMN_KEYS[31] = "RepeatExpansion_2";
|
||||
COLUMN_KEYS[32] = "RepeatExpansion_3";
|
||||
COLUMN_KEYS[33] = "RepeatExpansion_4";
|
||||
COLUMN_KEYS[34] = "RepeatExpansion_5";
|
||||
COLUMN_KEYS[35] = "RepeatExpansion_6";
|
||||
COLUMN_KEYS[36] = "RepeatExpansion_7";
|
||||
COLUMN_KEYS[37] = "RepeatExpansion_8";
|
||||
COLUMN_KEYS[38] = "RepeatExpansion_9";
|
||||
COLUMN_KEYS[39] = "RepeatExpansion_10orMore";
|
||||
COLUMN_KEYS[40] = "Other";
|
||||
|
||||
}
|
||||
|
||||
private static final int START_IND_NOVEL = 4;
|
||||
private static final int STOP_IND_NOVEL = 13;
|
||||
private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14;
|
||||
private static final int IND_FOR_REPEAT_EXPANSION_A = 14;
|
||||
private static final int IND_FOR_REPEAT_EXPANSION_C = 15;
|
||||
private static final int IND_FOR_REPEAT_EXPANSION_G = 16;
|
||||
private static final int IND_FOR_REPEAT_EXPANSION_T = 17;
|
||||
private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29;
|
||||
private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30;
|
||||
private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39;
|
||||
private static final int IND_FOR_OTHER_EVENT = 40;
|
||||
private static final int START_IND_NOVEL_PER_BASE = 0;
|
||||
private static final int STOP_IND_NOVEL_PER_BASE = 3;
|
||||
|
||||
private static String findMinimalEvent(String eventString) {
|
||||
|
||||
// for each length up to given string length, see if event string is a repetition of units of size N
|
||||
String minEvent = eventString;
|
||||
for (int k=1; k < eventString.length(); k++) {
|
||||
if (eventString.length() % k > 0)
|
||||
continue;
|
||||
String str = eventString.substring(0,k);
|
||||
// now see if event string is a repetition of str
|
||||
int numReps = eventString.length() / k;
|
||||
String r = "";
|
||||
for (int j=0; j < numReps; j++)
|
||||
r = r.concat(str);
|
||||
|
||||
if (r.matches(eventString)) {
|
||||
minEvent = str;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
return minEvent;
|
||||
}
|
||||
|
||||
public static ArrayList<Integer> findEventClassificationIndex(VariantContext vc, ReferenceContext ref) {
|
||||
int eventLength;
|
||||
|
||||
String indelAlleleString;
|
||||
boolean done = false;
|
||||
|
||||
ArrayList<Integer> inds = new ArrayList<Integer>();
|
||||
if ( vc.isInsertion() ) {
|
||||
indelAlleleString = vc.getAlternateAllele(0).getDisplayString();
|
||||
} else if ( vc.isDeletion() ) {
|
||||
indelAlleleString = vc.getReference().getDisplayString();
|
||||
}
|
||||
else {
|
||||
inds.add(IND_FOR_OTHER_EVENT);
|
||||
return inds;
|
||||
}
|
||||
|
||||
byte[] refBases = ref.getBases();
|
||||
|
||||
indelAlleleString = findMinimalEvent(indelAlleleString);
|
||||
eventLength = indelAlleleString.length();
|
||||
|
||||
// See first if indel is a repetition of bases before current
|
||||
int indStart = refBases.length/2-eventLength+1;
|
||||
|
||||
int numRepetitions = 0;
|
||||
while (!done) {
|
||||
if (indStart < 0)
|
||||
done = true;
|
||||
else {
|
||||
String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength));
|
||||
if (refPiece.matches(indelAlleleString))
|
||||
{
|
||||
numRepetitions++;
|
||||
indStart = indStart - eventLength;
|
||||
}
|
||||
else
|
||||
done = true;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// now do it forward
|
||||
done = false;
|
||||
indStart = refBases.length/2+1;
|
||||
while (!done) {
|
||||
if (indStart + eventLength >= refBases.length)
|
||||
break;
|
||||
else {
|
||||
String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength));
|
||||
if (refPiece.matches(indelAlleleString))
|
||||
{
|
||||
numRepetitions++;
|
||||
indStart = indStart + eventLength;
|
||||
}
|
||||
else
|
||||
done = true;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (numRepetitions == 0) {
|
||||
//unrepeated sequence from surroundings
|
||||
int ind = START_IND_NOVEL + (eventLength-1);
|
||||
if (ind > STOP_IND_NOVEL)
|
||||
ind = STOP_IND_NOVEL;
|
||||
inds.add(ind);
|
||||
|
||||
if (eventLength == 1) {
|
||||
// log single base indels additionally by base
|
||||
String keyStr = "Novel_" + indelAlleleString;
|
||||
int k;
|
||||
for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) {
|
||||
if (keyStr.matches(COLUMN_KEYS[k]))
|
||||
break;
|
||||
}
|
||||
inds.add(k);
|
||||
}
|
||||
}
|
||||
else {
|
||||
int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1);
|
||||
if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS)
|
||||
ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS;
|
||||
inds.add(ind);
|
||||
|
||||
if (eventLength<=2) {
|
||||
// for single or dinucleotide indels, we further log the base in which they occurred
|
||||
String keyStr = "RepeatExpansion_" + indelAlleleString;
|
||||
int k;
|
||||
for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) {
|
||||
if (keyStr.matches(COLUMN_KEYS[k]))
|
||||
break;
|
||||
}
|
||||
// log now event
|
||||
inds.add(k);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return inds;
|
||||
}
|
||||
|
||||
public static String[] getIndelClassificationNames() {
|
||||
return COLUMN_KEYS;
|
||||
}
|
||||
|
||||
public static String getIndelClassificationName(int k) {
|
||||
if (k >=0 && k < COLUMN_KEYS.length)
|
||||
return COLUMN_KEYS[k];
|
||||
else
|
||||
throw new ReviewedStingException("Invalid index when trying to get indel classification name");
|
||||
}
|
||||
|
||||
public static boolean isATExpansion(VariantContext vc, ReferenceContext ref) {
|
||||
ArrayList<Integer> inds = findEventClassificationIndex(vc, ref);
|
||||
|
||||
boolean isIt = false;
|
||||
for (int k : inds) {
|
||||
if (k == IND_FOR_REPEAT_EXPANSION_A || k == IND_FOR_REPEAT_EXPANSION_A) {
|
||||
isIt = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return isIt;
|
||||
|
||||
}
|
||||
public static boolean isCGExpansion(VariantContext vc, ReferenceContext ref) {
|
||||
return !isATExpansion(vc,ref);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue