(forgot I hadn't committed this) - refactored IndelStatistics module and added a new inner class to compute Indel classification along with other statistics. So, we now get an extra table specifying, per sample, counts of whether indels are:
- Repeat Expansions - Novel sequence And for indels of size <=2 we get a per-mononuc. or dinuc. breakdown of novels and expansions. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4828 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f795b25c47
commit
17db2e0e24
|
|
@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.report.tags.Analysis;
|
|||
import org.broadinstitute.sting.utils.report.tags.DataPoint;
|
||||
import org.broadinstitute.sting.utils.report.utils.TableType;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
||||
/*
|
||||
|
|
@ -37,11 +38,13 @@ import java.util.HashMap;
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
@Analysis(name = "Indel Statistics", description = "Shows various indel metrics and statistics")
|
||||
@Analysis(name = "IndelStatistics", description = "Shows various indel metrics and statistics")
|
||||
public class IndelStatistics extends VariantEvaluator {
|
||||
@DataPoint(name="IndelStatistics", description = "Indel Statistics")
|
||||
IndelStats indelStats = null;
|
||||
|
||||
@DataPoint(name="IndelClasses", description = "Indel Classification")
|
||||
IndelClasses indelClasses = null;
|
||||
|
||||
|
||||
private static final int INDEL_SIZE_LIMIT = 100;
|
||||
|
|
@ -54,6 +57,7 @@ public class IndelStatistics extends VariantEvaluator {
|
|||
static int index2len(int ind) {
|
||||
return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS;
|
||||
}
|
||||
|
||||
static class IndelStats implements TableType {
|
||||
protected final static String ALL_SAMPLES_KEY = "allSamples";
|
||||
protected final static String[] COLUMN_KEYS;
|
||||
|
|
@ -115,20 +119,6 @@ public class IndelStatistics extends VariantEvaluator {
|
|||
public String toString() {
|
||||
return getName();
|
||||
}
|
||||
/*
|
||||
private double ratio(long numer, long denom) {
|
||||
return denom != 0L ? 100.0 * ( ((double)numer) / ((double)denom) ) : 0.0;
|
||||
}
|
||||
|
||||
final long[] allSamplesNumerators = new long[COLUMN_KEYS.length];
|
||||
final long[] allSamplesDenominators = new long[COLUMN_KEYS.length];
|
||||
|
||||
private void updateSummaries(int i, double[] summary, long numer, long denom ) {
|
||||
allSamplesNumerators[i] += numer;
|
||||
allSamplesDenominators[i] += denom;
|
||||
summary[i] = ratio(numer, denom);
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
* increment the specified value
|
||||
|
|
@ -200,6 +190,241 @@ public class IndelStatistics extends VariantEvaluator {
|
|||
}
|
||||
}
|
||||
|
||||
static class IndelClasses implements TableType {
|
||||
protected final static String ALL_SAMPLES_KEY = "allSamples";
|
||||
protected final static String[] COLUMN_KEYS;
|
||||
|
||||
|
||||
|
||||
static {
|
||||
COLUMN_KEYS= new String[41];
|
||||
COLUMN_KEYS[0] = "Novel_A";
|
||||
COLUMN_KEYS[1] = "Novel_C";
|
||||
COLUMN_KEYS[2] = "Novel_G";
|
||||
COLUMN_KEYS[3] = "Novel_T";
|
||||
COLUMN_KEYS[4] = "NOVEL_1";
|
||||
COLUMN_KEYS[5] = "NOVEL_2";
|
||||
COLUMN_KEYS[6] = "NOVEL_3";
|
||||
COLUMN_KEYS[7] = "NOVEL_4";
|
||||
COLUMN_KEYS[8] = "NOVEL_5";
|
||||
COLUMN_KEYS[9] = "NOVEL_6";
|
||||
COLUMN_KEYS[10] = "NOVEL_7";
|
||||
COLUMN_KEYS[11] = "NOVEL_8";
|
||||
COLUMN_KEYS[12] = "NOVEL_9";
|
||||
COLUMN_KEYS[13] = "NOVEL_10orMore";
|
||||
COLUMN_KEYS[14] = "RepeatExpansion_A";
|
||||
COLUMN_KEYS[15] = "RepeatExpansion_C";
|
||||
COLUMN_KEYS[16] = "RepeatExpansion_G";
|
||||
COLUMN_KEYS[17] = "RepeatExpansion_T";
|
||||
COLUMN_KEYS[18] = "RepeatExpansion_AC";
|
||||
COLUMN_KEYS[19] = "RepeatExpansion_AG";
|
||||
COLUMN_KEYS[20] = "RepeatExpansion_AT";
|
||||
COLUMN_KEYS[21] = "RepeatExpansion_CA";
|
||||
COLUMN_KEYS[22] = "RepeatExpansion_CG";
|
||||
COLUMN_KEYS[23] = "RepeatExpansion_CT";
|
||||
COLUMN_KEYS[24] = "RepeatExpansion_GA";
|
||||
COLUMN_KEYS[25] = "RepeatExpansion_GC";
|
||||
COLUMN_KEYS[26] = "RepeatExpansion_GT";
|
||||
COLUMN_KEYS[27] = "RepeatExpansion_TA";
|
||||
COLUMN_KEYS[28] = "RepeatExpansion_TC";
|
||||
COLUMN_KEYS[29] = "RepeatExpansion_TG";
|
||||
COLUMN_KEYS[30] = "RepeatExpansion_1";
|
||||
COLUMN_KEYS[31] = "RepeatExpansion_2";
|
||||
COLUMN_KEYS[32] = "RepeatExpansion_3";
|
||||
COLUMN_KEYS[33] = "RepeatExpansion_4";
|
||||
COLUMN_KEYS[34] = "RepeatExpansion_5";
|
||||
COLUMN_KEYS[35] = "RepeatExpansion_6";
|
||||
COLUMN_KEYS[36] = "RepeatExpansion_7";
|
||||
COLUMN_KEYS[37] = "RepeatExpansion_8";
|
||||
COLUMN_KEYS[38] = "RepeatExpansion_9";
|
||||
COLUMN_KEYS[39] = "RepeatExpansion_10orMore";
|
||||
COLUMN_KEYS[40] = "Other";
|
||||
|
||||
}
|
||||
|
||||
private static final int START_IND_NOVEL = 4;
|
||||
private static final int STOP_IND_NOVEL = 13;
|
||||
private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14;
|
||||
private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29;
|
||||
private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30;
|
||||
private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39;
|
||||
private static final int IND_FOR_OTHER_EVENT = 40;
|
||||
private static final int START_IND_NOVEL_PER_BASE = 0;
|
||||
private static final int STOP_IND_NOVEL_PER_BASE = 3;
|
||||
|
||||
|
||||
// map of sample to statistics
|
||||
protected final HashMap<String, int[]> indelClassSummary = new HashMap<String, int[]>();
|
||||
|
||||
public IndelClasses(final VariantContext vc) {
|
||||
indelClassSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]);
|
||||
for( final String sample : vc.getGenotypes().keySet() ) {
|
||||
indelClassSummary.put(sample, new int[COLUMN_KEYS.length]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return one row per sample
|
||||
*/
|
||||
public Object[] getRowKeys() {
|
||||
return indelClassSummary.keySet().toArray(new String[indelClassSummary.size()]);
|
||||
}
|
||||
public Object getCell(int x, int y) {
|
||||
final Object[] rowKeys = getRowKeys();
|
||||
return String.format("%d",indelClassSummary.get(rowKeys[x])[y]);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the column keys
|
||||
* @return a list of objects, in this case strings, that are the column names
|
||||
*/
|
||||
public Object[] getColumnKeys() {
|
||||
return COLUMN_KEYS;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "IndelClasses";
|
||||
}
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 1; // we only need to see each eval track
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getName();
|
||||
}
|
||||
|
||||
private void incrementSampleStat(VariantContext vc, int index) {
|
||||
indelClassSummary.get(ALL_SAMPLES_KEY)[index]++;
|
||||
for( final String sample : vc.getGenotypes().keySet() ) {
|
||||
if ( indelClassSummary.containsKey(sample) ) {
|
||||
Genotype g = vc.getGenotype(sample);
|
||||
boolean isVariant = (g.isCalled() && !g.isHomRef());
|
||||
if (isVariant)
|
||||
// update count
|
||||
indelClassSummary.get(sample)[index]++;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
/*
|
||||
* increment the specified value
|
||||
*/
|
||||
public void incrValue(VariantContext vc, ReferenceContext ref) {
|
||||
int eventLength = 0;
|
||||
boolean isInsertion = false, isDeletion = false;
|
||||
String indelAlleleString;
|
||||
|
||||
if ( vc.isInsertion() ) {
|
||||
eventLength = vc.getAlternateAllele(0).length();
|
||||
isInsertion = true;
|
||||
indelAlleleString = vc.getAlternateAllele(0).getDisplayString();
|
||||
} else if ( vc.isDeletion() ) {
|
||||
eventLength = vc.getReference().length();
|
||||
isDeletion = true;
|
||||
indelAlleleString = vc.getReference().getDisplayString();
|
||||
}
|
||||
else {
|
||||
incrementSampleStat(vc, IND_FOR_OTHER_EVENT);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
byte[] refBases = ref.getBases();
|
||||
|
||||
|
||||
|
||||
// See first if indel is a repetition of bases before current
|
||||
int indStart = refBases.length/2-eventLength;
|
||||
|
||||
boolean done = false;
|
||||
int numRepetitions = 0;
|
||||
while (!done) {
|
||||
if (indStart < 0)
|
||||
done = true;
|
||||
else {
|
||||
String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength));
|
||||
if (refPiece.matches(indelAlleleString))
|
||||
{
|
||||
numRepetitions++;
|
||||
indStart = indStart - eventLength;
|
||||
}
|
||||
else
|
||||
done = true;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// now do it forward
|
||||
done = false;
|
||||
indStart = refBases.length/2;
|
||||
while (!done) {
|
||||
if (indStart + eventLength >= refBases.length)
|
||||
break;
|
||||
else {
|
||||
String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength));
|
||||
if (refPiece.matches(indelAlleleString))
|
||||
{
|
||||
numRepetitions++;
|
||||
indStart = indStart + eventLength;
|
||||
}
|
||||
else
|
||||
done = true;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (numRepetitions == 0) {
|
||||
//unrepeated sequence from surroundings
|
||||
int ind = START_IND_NOVEL + (eventLength-1);
|
||||
if (ind > STOP_IND_NOVEL)
|
||||
ind = STOP_IND_NOVEL;
|
||||
incrementSampleStat(vc, ind);
|
||||
|
||||
if (eventLength == 1) {
|
||||
// log single base indels additionally by base
|
||||
String keyStr = "Novel_" + indelAlleleString;
|
||||
int k;
|
||||
for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) {
|
||||
if (keyStr.matches(COLUMN_KEYS[k]))
|
||||
break;
|
||||
}
|
||||
// log now event
|
||||
incrementSampleStat(vc, k);
|
||||
}
|
||||
}
|
||||
else {
|
||||
int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1);
|
||||
if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS)
|
||||
ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS;
|
||||
incrementSampleStat(vc, ind);
|
||||
|
||||
if (eventLength<=2) {
|
||||
// for single or dinucleotide indels, we further log the base in which they occurred
|
||||
String keyStr = "RepeatExpansion_" + indelAlleleString;
|
||||
int k;
|
||||
for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) {
|
||||
if (keyStr.matches(COLUMN_KEYS[k]))
|
||||
break;
|
||||
}
|
||||
// log now event
|
||||
incrementSampleStat(vc, k);
|
||||
}
|
||||
|
||||
}
|
||||
//g+
|
||||
/*
|
||||
System.out.format("RefBefore: %s\n",new String(refBefore));
|
||||
System.out.format("RefAfter: %s\n",new String(refAfter));
|
||||
System.out.format("Indel Allele: %s\n", indelAlleleString);
|
||||
System.out.format("Num Repetitions: %d\n", numRepetitions);
|
||||
*/
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public IndelStatistics(VariantEvalWalker parent) {
|
||||
super(parent);
|
||||
// don't do anything
|
||||
|
|
@ -233,11 +458,16 @@ public class IndelStatistics extends VariantEvaluator {
|
|||
if ( nSamples != -1 )
|
||||
indelStats = new IndelStats(eval);
|
||||
}
|
||||
if ( indelClasses == null ) {
|
||||
indelClasses = new IndelClasses(eval);
|
||||
}
|
||||
|
||||
if ( eval.isIndel() &&
|
||||
eval.isBiallelic() &&
|
||||
indelStats != null ) {
|
||||
indelStats.incrValue(eval);
|
||||
if ( eval.isIndel() && eval.isBiallelic() ) {
|
||||
if (indelStats != null )
|
||||
indelStats.incrValue(eval);
|
||||
|
||||
if (indelClasses != null)
|
||||
indelClasses.incrValue(eval, ref);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue