Fixed haplotype boundary bug in PairHMMIndelErrorModel

Haplotypes were being clipped to the reference window whenever their unclipped ends extended beyond it. However, the unclipped ends include hard-clipped bases; therefore, if the reference window ended inside the hard-clipped bases of a read, the computed boundaries would be wrong (and the read clipper was throwing an exception).

   * updated code to use SoftEnd/SoftStart instead of UnclippedEnd/UnclippedStart where appropriate.
   * removed unnecessary code to remove hard clips after processing.
   * reorganized the logic to use the assigned read boundaries throughout the code (allowing it to be final).
This commit is contained in:
Mauricio Carneiro 2012-05-25 12:16:28 -04:00
parent 4bc04e2a9e
commit 2be5704a25
1 changed file with 37 additions and 105 deletions

View File

@ -25,15 +25,11 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.PairHMM;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -43,7 +39,6 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
public class PairHMMIndelErrorModel {
@ -58,7 +53,6 @@ public class PairHMMIndelErrorModel {
private static final double baseMismatchArray[];
private final static double LOG_ONE_HALF;
private final static double END_GAP_COST;
private static final int START_HRUN_GAP_IDX = 4;
private static final int MAX_HRUN_GAP_IDX = 20;
@ -76,7 +70,6 @@ public class PairHMMIndelErrorModel {
static {
LOG_ONE_HALF= -Math.log10(2.0);
END_GAP_COST = LOG_ONE_HALF;
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
@ -128,7 +121,6 @@ public class PairHMMIndelErrorModel {
// compute forward hrun length, example:
// AGGTGACCCCCCTGAGAG
// 001000012345000000
int runCount = 0;
hrunArray[0] = 0;
int[] hforward = new int[hrunArray.length];
int[] hreverse = new int[hrunArray.length];
@ -172,17 +164,13 @@ public class PairHMMIndelErrorModel {
}
}
}
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
ReferenceContext ref, int eventLength,
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
int numHaplotypes = haplotypeMap.size();
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
final int numHaplotypes = haplotypeMap.size();
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes];
final int readCounts[] = new int[pileup.getNumberOfElements()];
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
int readIdx=0;
PairHMM pairHMM = new PairHMM(bandedLikelihoods);
for (PileupElement p: pileup) {
// > 1 when the read is a consensus read representing multiple independent observations
readCounts[readIdx] = p.getRepresentativeCount();
@ -199,84 +187,46 @@ public class PairHMMIndelErrorModel {
if (DEBUG) {
System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
}
// System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read.isEmpty())
continue;
if (read.getUnclippedEnd() > ref.getWindow().getStop())
if (read.getSoftEnd() > ref.getWindow().getStop())
read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop());
if (read.isEmpty())
continue;
if (read.getUnclippedStart() < ref.getWindow().getStart())
if (read.getSoftStart() < ref.getWindow().getStart())
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart());
if (read.isEmpty())
continue;
// hard-clip low quality ends - this may introduce extra H elements in CIGAR string
read = ReadClipper.hardClipLowQualEnds(read,(byte)BASE_QUAL_THRESHOLD );
read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD );
if (read.isEmpty())
continue;
// get bases of candidate haplotypes that overlap with reads
final int trailingBases = 3;
long readStart = read.getUnclippedStart();
long readEnd = read.getUnclippedEnd();
int numStartSoftClippedBases, numEndSoftClippedBases;
final long readStart = read.getSoftStart();
final long readEnd = read.getSoftEnd();
// see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
// but they're actually consistent with the insertion!
// Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
// Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
long eventStartPos = ref.getLocus().getStart();
// compute total number of clipped bases (soft or hard clipped)
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
// check for hard clips (never consider these bases):
Cigar c = read.getCigar();
CigarElement first = c.getCigarElement(0);
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
if (first.getOperator() == CigarOperator.H) {
numStartHardClippedBases = first.getLength();
}
if (last.getOperator() == CigarOperator.H) {
numEndHardClippedBases = last.getLength();
}
// correct for hard clips
numStartSoftClippedBases -= numStartHardClippedBases;
numEndSoftClippedBases -= numEndHardClippedBases;
readStart += numStartHardClippedBases;
readEnd -= numEndHardClippedBases;
// remove soft clips if necessary
if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
numStartSoftClippedBases = 0;
numEndSoftClippedBases = 0;
}
byte[] unclippedReadBases, unclippedReadQuals;
int numStartClippedBases = numStartSoftClippedBases;
int numEndClippedBases = numEndSoftClippedBases;
unclippedReadBases = read.getReadBases();
unclippedReadQuals = read.getBaseQualities();
final long eventStartPos = ref.getLocus().getStart();
// compute total number of clipped bases (soft or hard clipped) and only use them if necessary
final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength);
final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0;
final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ;
final byte [] unclippedReadBases = read.getReadBases();
final byte [] unclippedReadQuals = read.getBaseQualities();
final int extraOffset = Math.abs(eventLength);
/**
@ -284,64 +234,53 @@ public class PairHMMIndelErrorModel {
* Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord,
* adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above.
* We will propose haplotypes that overlap the read with some padding.
* True read start = readStart + numStartClippedBases - ReadUtils.getFirstInsertionOffset(read)
* True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read)
* Last term is because if a read starts with an insertion then these bases are not accounted for in readStart.
* trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to
* differentiate context between two haplotypes
*/
long startLocationInRefForHaplotypes = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
long stopLocationInRefForHaplotypes = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
long stopLocationInRefForHaplotypes = readEnd -numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
if (DEBUG)
System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
// check if start of read will be before start of reference context
if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) {
// read starts before haplotype: read will have to be cut
//numStartClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
startLocationInRefForHaplotypes = ref.getWindow().getStart();
startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
}
// check also if end of read will go beyond reference context
if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
//numEndClippedBases += stopLocationInRefForHaplotypes - ref.getWindow().getStop();
stopLocationInRefForHaplotypes = ref.getWindow().getStop();
stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context
}
// if there's an insertion in the read, the read stop position will be less than start + read length,
// but we want to compute likelihoods in the whole region that a read might overlap
if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) {
stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1;
stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read length, but we want to compute likelihoods in the whole region that a read might overlap
}
// ok, we now figured out total number of clipped bases on both ends.
// ok, we now figured out the total number of clipped bases on both ends.
// Figure out where we want to place the haplotype to score read against
if (DEBUG)
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();
/**
* Check if we'll end up with an empty read once all clipping is done
*/
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) {
int j=0;
for (Allele a: haplotypeMap.keySet()) {
readEl.put(a,0.0);
readLikelihoods[readIdx][j++] = 0.0;
}
}
else {
final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
int j=0;
// initialize path metric and traceback memories for likelihood computation
@ -351,7 +290,7 @@ public class PairHMMIndelErrorModel {
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
// get homopolymer length profile for current haplotype
int[] hrunProfile = new int[readBases.length];
final int[] hrunProfile = new int[readBases.length];
getContextHomopolymerLength(readBases,hrunProfile);
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
@ -431,6 +370,10 @@ public class PairHMMIndelErrorModel {
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}
/**
 * Decides whether the soft-clipped bases of a read should be included in the likelihood calculation.
 *
 * Aligners may soft clip all bases at an insertion because they do not match the reference,
 * even though they are consistent with the insertion. We therefore KEEP soft clips (return false
 * here means "drop them") only when the read neither starts in the window
 * [eventStart - eventLength, eventStart + 1] nor ends in [eventStart, eventStart + eventLength].
 *
 * @param read          the (already adaptor/window/qual clipped) read under consideration
 * @param eventStartPos reference coordinate where the candidate indel event starts
 * @param eventLength   length of the candidate indel event
 * @return true if the soft-clipped bases should be treated as regular clipped bases (i.e. not used),
 *         false if the read abuts the event and its soft clips should be used
 */
private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) {
    final boolean startsWithinEvent = read.getAlignmentStart() >= eventStartPos - eventLength
            && read.getAlignmentStart() <= eventStartPos + 1;
    final boolean endsWithinEvent = read.getAlignmentEnd() >= eventStartPos
            && read.getAlignmentEnd() <= eventStartPos + eventLength;
    // De Morgan: !(a || b) == !a && !b — keep the clips only when neither boundary touches the event
    return !startsWithinEvent && !endsWithinEvent;
}
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
if (b1.length != b2.length)
return 0; // sanity check
@ -442,18 +385,7 @@ public class PairHMMIndelErrorModel {
return b1.length;
}
/**
 * Finds the index of the first position at which the two double arrays differ.
 *
 * Comparison is delegated to MathUtils.compareDoubles so that floating point
 * tolerance is applied consistently with the rest of the codebase.
 *
 * @param b1 first array
 * @param b2 second array
 * @return 0 if the arrays have different lengths (sanity check); otherwise the index of the
 *         first differing element, or b1.length if the arrays are equal throughout
 */
private int computeFirstDifferingPosition(double[] b1, double[] b2) {
    if (b1.length != b2.length)
        return 0; // sanity check
    int idx = 0;
    while (idx < b1.length) {
        if (MathUtils.compareDoubles(b1[idx], b2[idx]) != 0)
            return idx;
        idx++;
    }
    return b1.length; // no differing position found
}
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix