Several major fixes and changes to new indel likelihood model:

a) Scrapped the way in which we constructed candidate haplotypes because it wasn't fully correct and yielded corner conditions with incorrect genotyping and likelihood computation. Ideally, a haplotype should "cover" the read and the most likely alignments should be such that the ends of the read are inside the ends of the haplotype. This wasn't happening, and if you have a "dangling read off a haplotype" the probabilistic alignment model may prefer to shift a read instead of scoring it correctly - this is especially bad with tandem repeat insertions. 
So now, we build haplotypes based on the reference context and adaptively change them based on read alignment positions, plus some padding and uncertainty in the alignment.

b) Changed the way soft-clipped bases are dealt with. Instead of either ignoring them or using them, we only use them if the read start or end position (after soft clipping) is within eventDistance of the current location. This is done because it's very common that BWA's strictly local SW implementation will soft clip every single read at an insertion position because it couldn't place that end of the read without too many mismatches, but the read is legit and the bases are good quality. If we don't take these bases into consideration, reads which are informative of an insertion event are essentially discarded because the informative part is clipped away. 

c) Several cleanups and fixes to the context-dependent gap penalty model based on length of HRun.





git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5464 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2011-03-17 18:39:31 +00:00
parent 18002abb1f
commit b45afe5ba8
2 changed files with 122 additions and 106 deletions

View File

@ -320,7 +320,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
return null;
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
if (ref.getWindow().getStop() <= loc.getStop()+HAPLOTYPE_SIZE)
if (ref.getWindow().getStop() < loc.getStop()+HAPLOTYPE_SIZE)
return null;
if ( !(priors instanceof DiploidIndelGenotypePriors) )
throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");
@ -330,16 +330,19 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
refAllele = alleleList.get(0);
altAllele = alleleList.get(1);
int eventLength = refAllele.getBaseString().length() - altAllele.getBaseString().length();
int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
// assume only one alt allele for now
if (eventLength<0)
eventLength = - eventLength;
// int numSamples = getNSamples(contexts);
List<Haplotype> haplotypesInVC;
if (newLike)
if (newLike) {
int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
int numPrefBases = ref.getLocus().getStart()-ref.getWindow().getStart()+1;
if (DEBUG)
System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
(int)ref.getWindow().size(), loc.getStart(), numPrefBases);
haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
ref, HAPLOTYPE_SIZE, HAPLOTYPE_SIZE/2);
ref, hsize, numPrefBases);
}
else
haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
ref, HAPLOTYPE_SIZE, 20);
@ -351,13 +354,6 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
double[][] haplotypeLikehoodMatrix;
if (useFlatPriors) {
priors = new DiploidIndelGenotypePriors();
}
else
priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,HAPLOTYPE_SIZE);
//double[] priorLikelihoods = priors.getPriors();
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
AlignmentContext context = sample.getValue().getContext(contextType);
@ -371,7 +367,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
if (pileup != null ) {
if (newLike)
haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref);
haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref, HAPLOTYPE_SIZE, eventLength);
else
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);

View File

@ -45,7 +45,7 @@ import java.util.List;
public class PairHMMIndelErrorModel {
private final int BASE_QUAL_THRESHOLD = 4;
private final int BASE_QUAL_THRESHOLD = 10;
private static final int MATCH_OFFSET = 0;
@ -85,34 +85,24 @@ public class PairHMMIndelErrorModel {
private final static double LOG_ONE_THIRD;
private final static double LOG_ONE_HALF;
private final static double END_GAP_COST;
private static final int START_HRUN_GAP_IDX = 4;
private static final int MAX_HRUN_GAP_IDX = 20;
private static final double MIN_GAP_PENALTY = 20.0;
private static final double MIN_GAP_OPEN_PENALTY = 20.0;
private static final double MIN_GAP_CONT_PENALTY = 6.0;
private static final double GAP_PENALTY_HRUN_STEP = 2.0; // each increase in hrun decreases gap penalty by this.
/*
private final double c;
private final double d;
private final double e;
private final double oneMinusTwoEps;
*/
private final int trailingBases = 3;
private boolean doViterbi = false;
private final boolean useSoftClippedBases = false;
private final boolean useAffineGapModel = true;
private boolean doContextDependentPenalties = false;
private String s1;
private String s2;
private double[][] contextLogGapOpenProbabilities;
private double[][] contextLogGapContinuationProbabilities;
private double[] currentContextGOP;
private double[] currentContextGCP;
private final double[] GAP_OPEN_PROB_TABLE;
private final double[] GAP_CONT_PROB_TABLE;
@ -122,7 +112,7 @@ public class PairHMMIndelErrorModel {
static {
LOG_ONE_THIRD= -Math.log10(3.0);
LOG_ONE_HALF= -Math.log10(2.0);
final double OFFSET = -Math.log10(0.25);
END_GAP_COST = LOG_ONE_HALF;
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
@ -130,8 +120,8 @@ public class PairHMMIndelErrorModel {
double baseProb = Math.pow(10, -k/10.);
baseMatchArray[k] = Math.log10(1-baseProb)+ OFFSET;
baseMismatchArray[k] = Math.log10(baseProb)+LOG_ONE_THIRD+OFFSET;
baseMatchArray[k] = Math.log10(1-baseProb);
baseMismatchArray[k] = Math.log10(baseProb);
}
}
@ -164,16 +154,17 @@ public class PairHMMIndelErrorModel {
double gcp = logGapContinuationProbability;
double step = GAP_PENALTY_HRUN_STEP/10.0;
double maxGP = -MIN_GAP_PENALTY/10.0; // phred to log prob
double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob
double maxGCP = -MIN_GAP_CONT_PENALTY/10.0; // phred to log prob
for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) {
gop += step;
if (gop > maxGP)
gop = maxGP;
if (gop > maxGOP)
gop = maxGOP;
gcp += step;
if(gcp > maxGP)
gcp = maxGP;
if(gcp > maxGCP)
gcp = maxGCP;
GAP_OPEN_PROB_TABLE[i] = gop;
GAP_CONT_PROB_TABLE[i] = gcp;
}
@ -182,7 +173,7 @@ public class PairHMMIndelErrorModel {
}
public double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) {
private double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) {
final int X_METRIC_LENGTH = readBases.length+1;
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
@ -308,7 +299,7 @@ public class PairHMMIndelErrorModel {
}
private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) {
static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) {
// compute forward hrun length, example:
// AGGTGACCCCCCTGAGAG
// 001000012345000000
@ -340,7 +331,7 @@ public class PairHMMIndelErrorModel {
hrunArray[i] = hforward[i]+hreverse[i];
}
private Pair<Double,Double> getGapPenalties(final int indI, final int indJ, final int X_METRIC_LENGTH,
final int Y_METRIC_LENGTH, final int tableToUpdate) {
final int Y_METRIC_LENGTH, final int tableToUpdate, double[] currentContextGOP, double[] currentContextGCP) {
double c=0.0,d=0.0;
@ -350,33 +341,34 @@ public class PairHMMIndelErrorModel {
break;
case X_OFFSET:
c = (indJ==Y_METRIC_LENGTH-1? 0: currentContextGOP[indJ-1]);
d = (indJ==Y_METRIC_LENGTH-1? 0: currentContextGCP[indJ-1]);
c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentContextGOP[indJ-1]);
d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentContextGCP[indJ-1]);
break;
case Y_OFFSET:
c = (indI==X_METRIC_LENGTH-1? 0: currentContextGOP[indJ-1]);
d = (indI==X_METRIC_LENGTH-1? 0: currentContextGCP[indJ-1]);
c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentContextGOP[indJ-1]);
d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentContextGCP[indJ-1]);
break;
default:
throw new StingException("BUG!! Invalid table offset");
}
} else {
}
else {
switch (tableToUpdate) {
case MATCH_OFFSET:
break;
case X_OFFSET:
c = (indJ==Y_METRIC_LENGTH-1? 0: logGapOpenProbability);
d = (indJ==Y_METRIC_LENGTH-1? 0: logGapContinuationProbability);
c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: logGapOpenProbability);
d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: logGapContinuationProbability);
break;
case Y_OFFSET:
c = (indI==X_METRIC_LENGTH-1? 0: logGapOpenProbability);
d = (indI==X_METRIC_LENGTH-1? 0: logGapContinuationProbability);
c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: logGapOpenProbability);
d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: logGapContinuationProbability);
break;
default:
@ -386,7 +378,8 @@ public class PairHMMIndelErrorModel {
return new Pair<Double,Double>(Double.valueOf(c),Double.valueOf(d));
}
public double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) {
private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals,
double[] currentGOP, double[] currentGCP) {
final int X_METRIC_LENGTH = readBases.length+1;
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
@ -399,13 +392,13 @@ public class PairHMMIndelErrorModel {
int[][] bestActionArrayX = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
int[][] bestActionArrayY = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
matchMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY;
matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY;
for (int i=1; i < X_METRIC_LENGTH; i++) {
//initialize first column
matchMetricArray[i][0] = Double.NEGATIVE_INFINITY;
YMetricArray[i][0] = Double.NEGATIVE_INFINITY;
XMetricArray[i][0] = 0;//logGapOpenProbability + (i-1)*logGapContinuationProbability;
XMetricArray[i][0] = END_GAP_COST*(i);//logGapOpenProbability + (i-1)*logGapContinuationProbability;
bestActionArrayX[i][0] = bestActionArrayY[i][0] = bestActionArrayM[i][0] = UP_GOTO_X;
}
@ -414,7 +407,7 @@ public class PairHMMIndelErrorModel {
// initialize first row
matchMetricArray[0][j] = Double.NEGATIVE_INFINITY;
XMetricArray[0][j] = Double.NEGATIVE_INFINITY;
YMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability;
YMetricArray[0][j] = END_GAP_COST*(j);//logGapOpenProbability + (j-1) * logGapContinuationProbability;
bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y;
}
@ -460,8 +453,7 @@ public class PairHMMIndelErrorModel {
// update X array
// State X(i,j): X(1:i) aligned to a gap in Y(1:j).
// When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps
Pair<Double,Double> p = getGapPenalties(indI, indJ, X_METRIC_LENGTH,
Y_METRIC_LENGTH, X_OFFSET);
Pair<Double,Double> p = getGapPenalties(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, X_OFFSET, currentGOP, currentGCP);
metrics[MATCH_OFFSET] = matchMetricArray[indI-1][indJ] + p.first;
metrics[X_OFFSET] = XMetricArray[indI-1][indJ] + p.second;
@ -478,8 +470,7 @@ public class PairHMMIndelErrorModel {
bestActionArrayX[indI][indJ] = ACTIONS_X[bestMetricIdx];
// update Y array
p = getGapPenalties(indI, indJ, X_METRIC_LENGTH,
Y_METRIC_LENGTH, Y_OFFSET);
p = getGapPenalties(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, Y_OFFSET, currentGOP, currentGCP);
metrics[MATCH_OFFSET] = matchMetricArray[indI][indJ-1] + p.first;
metrics[X_OFFSET] = Double.NEGATIVE_INFINITY; //XMetricArray[indI][indJ-1] + logGapOpenProbability;
@ -578,7 +569,8 @@ public class PairHMMIndelErrorModel {
}
private void getGapProbabilities(int hIndex, int[] hrunProfile) {
private void fillGapProbabilities(int hIndex, int[] hrunProfile,
double[][] contextLogGapOpenProbabilities, double[][] contextLogGapContinuationProbabilities) {
// fill based on lookup table
for (int i = 0; i < hrunProfile.length; i++) {
if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) {
@ -591,36 +583,44 @@ public class PairHMMIndelErrorModel {
}
}
}
public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC, ReferenceContext ref){
public synchronized double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC,
ReferenceContext ref, int haplotypeSize, int eventLength){
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
int readIdx=0;
double[][] contextLogGapOpenProbabilities = null;
double[][] contextLogGapContinuationProbabilities = null;
if (DEBUG) {
System.out.println("Reference bases:");
System.out.println(new String(ref.getBases()));
}
if (doContextDependentPenalties) {
// will context dependent probabilities based on homopolymet run. Probabilities are filled based on total complete haplotypes.
for (int j=0; j < haplotypesInVC.size(); j++) {
Haplotype haplotype = haplotypesInVC.get(j);
byte[] haplotypeBases = haplotype.getBasesAsBytes();
if (contextLogGapOpenProbabilities == null) {
contextLogGapOpenProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
contextLogGapContinuationProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
}
// get homopolymer length profile for current haplotype
int[] hrunProfile = new int[haplotypeBases.length];
getContextHomopolymerLength(haplotypeBases,hrunProfile);
if (DEBUG) {
System.out.println("Haplotype bases:");
System.out.println(new String(haplotypeBases));
for (int i=0; i < hrunProfile.length; i++)
System.out.format("%d",hrunProfile[i]);
System.out.println();
}
getGapProbabilities(j, hrunProfile);
for (int j=0; j < haplotypesInVC.size(); j++) {
Haplotype haplotype = haplotypesInVC.get(j);
byte[] haplotypeBases = haplotype.getBasesAsBytes();
if (contextLogGapOpenProbabilities == null) {
contextLogGapOpenProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
contextLogGapContinuationProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
}
// get homopolymer length profile for current haplotype
int[] hrunProfile = new int[haplotypeBases.length];
getContextHomopolymerLength(haplotypeBases,hrunProfile);
if (DEBUG) {
System.out.println("Haplotype bases:");
System.out.println(new String(haplotypeBases));
for (int i=0; i < hrunProfile.length; i++)
System.out.format("%d",hrunProfile[i]);
System.out.println();
}
fillGapProbabilities(j, hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
}
}
for (SAMRecord read : pileup.getReads()) {
@ -634,16 +634,31 @@ public class PairHMMIndelErrorModel {
// get bases of candidate haplotypes that overlap with reads
int offset = trailingBases / 2;
final int trailingBases = 3;
long readStart = read.getUnclippedStart();
long readEnd = read.getUnclippedEnd();
int numStartSoftClippedBases =0, numEndSoftClippedBases = 0;
if (!useSoftClippedBases) {
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
int numStartSoftClippedBases, numEndSoftClippedBases;
// see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
// but they're actually consistent with the insertion!
// Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
// Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
long eventStartPos = ref.getLocus().getStart();
// default: discard soft-clipped bases
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
if (eventLength > 0) {
if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
numStartSoftClippedBases = 0;
numEndSoftClippedBases = 0;
}
}
byte[] unclippedReadBases, unclippedReadQuals;
@ -668,14 +683,10 @@ public class PairHMMIndelErrorModel {
break;
}
long start = Math.max(readStart + numStartClippedBases - offset - ReadUtils.getFirstInsertionOffset(read), 0);
long stop = readEnd -numEndClippedBases + offset + ReadUtils.getLastInsertionOffset(read);
int extraOffset = Math.abs(eventLength);
// ok, we now figured out total number of clipped bases on both ends.
// Figure out where we want to place the haplotype to score read against
if (DEBUG)
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
// Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
@ -693,16 +704,13 @@ public class PairHMMIndelErrorModel {
stop = start + readLength-1;
}
// ok, we now figured out total number of clipped bases on both ends.
// Figure out where we want to place the haplotype to score read against
if (DEBUG)
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
/*
byte[] readAlignedBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getReadBases()); // Adjust the read bases based on the Cigar string
byte[] readAlignedQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getBaseQualities()); // Shift the location of the qual scores based on the Cigar string
System.out.println("Aligned bases:"+new String(readAlignedBases));
*/
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
if (DEBUG)
@ -731,15 +739,21 @@ public class PairHMMIndelErrorModel {
for (int j=0; j < haplotypesInVC.size(); j++) {
Haplotype haplotype = haplotypesInVC.get(j);
if (stop > haplotype.getStopPosition())
stop = haplotype.getStopPosition();
if (start < haplotype.getStartPosition())
start = haplotype.getStartPosition();
// cut haplotype bases
//long indStart = start - haplotype.getStartPosition();
//long indStop = stop - haplotype.getStartPosition();
long indStart = start - haplotype.getStartPosition();
long indStop = stop - haplotype.getStartPosition();
//long indStart = 0;
//long indStop = 400;
//byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
// (int)indStart, (int)indStop);
byte[] haplotypeBases = haplotype.getBasesAsBytes();
byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
(int)indStart, (int)indStop);
//byte[] haplotypeBases = haplotype.getBasesAsBytes();
//BAQ.BAQCalculationResult baqRes = baq.calcBAQFromHMM(read, haplotypeBases, (int)(start - readStart));
/* if (s1 != null && s2 != null) {
haplotypeBases = s2.getBytes();
@ -770,9 +784,15 @@ public class PairHMMIndelErrorModel {
}
if (useAffineGapModel) {
currentContextGOP = contextLogGapOpenProbabilities[j];
currentContextGCP = contextLogGapContinuationProbabilities[j];
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals);
double[] currentContextGOP = null;
double[] currentContextGCP = null;
if (doContextDependentPenalties) {
currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop);
currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop);
}
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
}
else