(forgot I hadn't committed this) - refactored IndelStatistics module and added a new inner class to compute Indel classification along with other statistics. So, we now get an extra table specifying, per sample, counts of whether indels are:

- Repeat Expansions
- Novel sequence
And for indels of size <=2 we get a per-mononuc. or dinuc. breakdown of novels and expansions.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4828 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-12-13 17:43:43 +00:00
parent f795b25c47
commit 17db2e0e24
1 changed files with 249 additions and 19 deletions

View File

@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.report.tags.Analysis;
import org.broadinstitute.sting.utils.report.tags.DataPoint;
import org.broadinstitute.sting.utils.report.utils.TableType;
import java.util.Arrays;
import java.util.HashMap;
/*
@ -37,11 +38,13 @@ import java.util.HashMap;
* OTHER DEALINGS IN THE SOFTWARE.
*/
@Analysis(name = "Indel Statistics", description = "Shows various indel metrics and statistics")
@Analysis(name = "IndelStatistics", description = "Shows various indel metrics and statistics")
public class IndelStatistics extends VariantEvaluator {
@DataPoint(name="IndelStatistics", description = "Indel Statistics")
IndelStats indelStats = null;
@DataPoint(name="IndelClasses", description = "Indel Classification")
IndelClasses indelClasses = null;
private static final int INDEL_SIZE_LIMIT = 100;
@ -54,6 +57,7 @@ public class IndelStatistics extends VariantEvaluator {
static int index2len(int ind) {
return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS;
}
static class IndelStats implements TableType {
protected final static String ALL_SAMPLES_KEY = "allSamples";
protected final static String[] COLUMN_KEYS;
@ -115,20 +119,6 @@ public class IndelStatistics extends VariantEvaluator {
public String toString() {
return getName();
}
/*
private double ratio(long numer, long denom) {
return denom != 0L ? 100.0 * ( ((double)numer) / ((double)denom) ) : 0.0;
}
final long[] allSamplesNumerators = new long[COLUMN_KEYS.length];
final long[] allSamplesDenominators = new long[COLUMN_KEYS.length];
private void updateSummaries(int i, double[] summary, long numer, long denom ) {
allSamplesNumerators[i] += numer;
allSamplesDenominators[i] += denom;
summary[i] = ratio(numer, denom);
}
*/
/*
* increment the specified value
@ -200,6 +190,241 @@ public class IndelStatistics extends VariantEvaluator {
}
}
static class IndelClasses implements TableType {
protected final static String ALL_SAMPLES_KEY = "allSamples";
protected final static String[] COLUMN_KEYS;
static {
COLUMN_KEYS= new String[41];
COLUMN_KEYS[0] = "Novel_A";
COLUMN_KEYS[1] = "Novel_C";
COLUMN_KEYS[2] = "Novel_G";
COLUMN_KEYS[3] = "Novel_T";
COLUMN_KEYS[4] = "NOVEL_1";
COLUMN_KEYS[5] = "NOVEL_2";
COLUMN_KEYS[6] = "NOVEL_3";
COLUMN_KEYS[7] = "NOVEL_4";
COLUMN_KEYS[8] = "NOVEL_5";
COLUMN_KEYS[9] = "NOVEL_6";
COLUMN_KEYS[10] = "NOVEL_7";
COLUMN_KEYS[11] = "NOVEL_8";
COLUMN_KEYS[12] = "NOVEL_9";
COLUMN_KEYS[13] = "NOVEL_10orMore";
COLUMN_KEYS[14] = "RepeatExpansion_A";
COLUMN_KEYS[15] = "RepeatExpansion_C";
COLUMN_KEYS[16] = "RepeatExpansion_G";
COLUMN_KEYS[17] = "RepeatExpansion_T";
COLUMN_KEYS[18] = "RepeatExpansion_AC";
COLUMN_KEYS[19] = "RepeatExpansion_AG";
COLUMN_KEYS[20] = "RepeatExpansion_AT";
COLUMN_KEYS[21] = "RepeatExpansion_CA";
COLUMN_KEYS[22] = "RepeatExpansion_CG";
COLUMN_KEYS[23] = "RepeatExpansion_CT";
COLUMN_KEYS[24] = "RepeatExpansion_GA";
COLUMN_KEYS[25] = "RepeatExpansion_GC";
COLUMN_KEYS[26] = "RepeatExpansion_GT";
COLUMN_KEYS[27] = "RepeatExpansion_TA";
COLUMN_KEYS[28] = "RepeatExpansion_TC";
COLUMN_KEYS[29] = "RepeatExpansion_TG";
COLUMN_KEYS[30] = "RepeatExpansion_1";
COLUMN_KEYS[31] = "RepeatExpansion_2";
COLUMN_KEYS[32] = "RepeatExpansion_3";
COLUMN_KEYS[33] = "RepeatExpansion_4";
COLUMN_KEYS[34] = "RepeatExpansion_5";
COLUMN_KEYS[35] = "RepeatExpansion_6";
COLUMN_KEYS[36] = "RepeatExpansion_7";
COLUMN_KEYS[37] = "RepeatExpansion_8";
COLUMN_KEYS[38] = "RepeatExpansion_9";
COLUMN_KEYS[39] = "RepeatExpansion_10orMore";
COLUMN_KEYS[40] = "Other";
}
private static final int START_IND_NOVEL = 4;
private static final int STOP_IND_NOVEL = 13;
private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14;
private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29;
private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30;
private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39;
private static final int IND_FOR_OTHER_EVENT = 40;
private static final int START_IND_NOVEL_PER_BASE = 0;
private static final int STOP_IND_NOVEL_PER_BASE = 3;
// map of sample to statistics
protected final HashMap<String, int[]> indelClassSummary = new HashMap<String, int[]>();
public IndelClasses(final VariantContext vc) {
indelClassSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]);
for( final String sample : vc.getGenotypes().keySet() ) {
indelClassSummary.put(sample, new int[COLUMN_KEYS.length]);
}
}
/**
*
* @return one row per sample
*/
public Object[] getRowKeys() {
return indelClassSummary.keySet().toArray(new String[indelClassSummary.size()]);
}
public Object getCell(int x, int y) {
final Object[] rowKeys = getRowKeys();
return String.format("%d",indelClassSummary.get(rowKeys[x])[y]);
}
/**
* get the column keys
* @return a list of objects, in this case strings, that are the column names
*/
public Object[] getColumnKeys() {
return COLUMN_KEYS;
}
public String getName() {
return "IndelClasses";
}
public int getComparisonOrder() {
return 1; // we only need to see each eval track
}
public String toString() {
return getName();
}
private void incrementSampleStat(VariantContext vc, int index) {
indelClassSummary.get(ALL_SAMPLES_KEY)[index]++;
for( final String sample : vc.getGenotypes().keySet() ) {
if ( indelClassSummary.containsKey(sample) ) {
Genotype g = vc.getGenotype(sample);
boolean isVariant = (g.isCalled() && !g.isHomRef());
if (isVariant)
// update count
indelClassSummary.get(sample)[index]++;
}
}
}
/*
* increment the specified value
*/
public void incrValue(VariantContext vc, ReferenceContext ref) {
int eventLength = 0;
boolean isInsertion = false, isDeletion = false;
String indelAlleleString;
if ( vc.isInsertion() ) {
eventLength = vc.getAlternateAllele(0).length();
isInsertion = true;
indelAlleleString = vc.getAlternateAllele(0).getDisplayString();
} else if ( vc.isDeletion() ) {
eventLength = vc.getReference().length();
isDeletion = true;
indelAlleleString = vc.getReference().getDisplayString();
}
else {
incrementSampleStat(vc, IND_FOR_OTHER_EVENT);
return;
}
byte[] refBases = ref.getBases();
// See first if indel is a repetition of bases before current
int indStart = refBases.length/2-eventLength;
boolean done = false;
int numRepetitions = 0;
while (!done) {
if (indStart < 0)
done = true;
else {
String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength));
if (refPiece.matches(indelAlleleString))
{
numRepetitions++;
indStart = indStart - eventLength;
}
else
done = true;
}
}
// now do it forward
done = false;
indStart = refBases.length/2;
while (!done) {
if (indStart + eventLength >= refBases.length)
break;
else {
String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength));
if (refPiece.matches(indelAlleleString))
{
numRepetitions++;
indStart = indStart + eventLength;
}
else
done = true;
}
}
if (numRepetitions == 0) {
//unrepeated sequence from surroundings
int ind = START_IND_NOVEL + (eventLength-1);
if (ind > STOP_IND_NOVEL)
ind = STOP_IND_NOVEL;
incrementSampleStat(vc, ind);
if (eventLength == 1) {
// log single base indels additionally by base
String keyStr = "Novel_" + indelAlleleString;
int k;
for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) {
if (keyStr.matches(COLUMN_KEYS[k]))
break;
}
// log now event
incrementSampleStat(vc, k);
}
}
else {
int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1);
if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS)
ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS;
incrementSampleStat(vc, ind);
if (eventLength<=2) {
// for single or dinucleotide indels, we further log the base in which they occurred
String keyStr = "RepeatExpansion_" + indelAlleleString;
int k;
for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) {
if (keyStr.matches(COLUMN_KEYS[k]))
break;
}
// log now event
incrementSampleStat(vc, k);
}
}
//g+
/*
System.out.format("RefBefore: %s\n",new String(refBefore));
System.out.format("RefAfter: %s\n",new String(refAfter));
System.out.format("Indel Allele: %s\n", indelAlleleString);
System.out.format("Num Repetitions: %d\n", numRepetitions);
*/
}
}
public IndelStatistics(VariantEvalWalker parent) {
super(parent);
// don't do anything
@ -233,11 +458,16 @@ public class IndelStatistics extends VariantEvaluator {
if ( nSamples != -1 )
indelStats = new IndelStats(eval);
}
if ( indelClasses == null ) {
indelClasses = new IndelClasses(eval);
}
if ( eval.isIndel() &&
eval.isBiallelic() &&
indelStats != null ) {
indelStats.incrValue(eval);
if ( eval.isIndel() && eval.isBiallelic() ) {
if (indelStats != null )
indelStats.incrValue(eval);
if (indelClasses != null)
indelClasses.incrValue(eval, ref);
}
}