Merged bug fix from Stable into Unstable

This commit is contained in:
Mauricio Carneiro 2012-05-25 13:03:05 -04:00
commit 4109fcbb08
1 changed file with 37 additions and 105 deletions

View File

@ -25,15 +25,11 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.PairHMM;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -43,7 +39,6 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
public class PairHMMIndelErrorModel {
@ -58,7 +53,6 @@ public class PairHMMIndelErrorModel {
private static final double baseMismatchArray[];
private final static double LOG_ONE_HALF;
private final static double END_GAP_COST;
private static final int START_HRUN_GAP_IDX = 4;
private static final int MAX_HRUN_GAP_IDX = 20;
@ -76,7 +70,6 @@ public class PairHMMIndelErrorModel {
static {
LOG_ONE_HALF= -Math.log10(2.0);
END_GAP_COST = LOG_ONE_HALF;
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
@ -128,7 +121,6 @@ public class PairHMMIndelErrorModel {
// compute forward hrun length, example:
// AGGTGACCCCCCTGAGAG
// 001000012345000000
int runCount = 0;
hrunArray[0] = 0;
int[] hforward = new int[hrunArray.length];
int[] hreverse = new int[hrunArray.length];
@ -172,17 +164,13 @@ public class PairHMMIndelErrorModel {
}
}
}
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
ReferenceContext ref, int eventLength,
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
int numHaplotypes = haplotypeMap.size();
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
final int numHaplotypes = haplotypeMap.size();
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes];
final int readCounts[] = new int[pileup.getNumberOfElements()];
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
int readIdx=0;
PairHMM pairHMM = new PairHMM(bandedLikelihoods);
for (PileupElement p: pileup) {
// > 1 when the read is a consensus read representing multiple independent observations
readCounts[readIdx] = p.getRepresentativeCount();
@ -199,84 +187,46 @@ public class PairHMMIndelErrorModel {
if (DEBUG) {
System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
}
// System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read.isEmpty())
continue;
if (read.getUnclippedEnd() > ref.getWindow().getStop())
if (read.getSoftEnd() > ref.getWindow().getStop())
read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop());
if (read.isEmpty())
continue;
if (read.getUnclippedStart() < ref.getWindow().getStart())
if (read.getSoftStart() < ref.getWindow().getStart())
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart());
if (read.isEmpty())
continue;
// hard-clip low quality ends - this may introduce extra H elements in CIGAR string
read = ReadClipper.hardClipLowQualEnds(read,(byte)BASE_QUAL_THRESHOLD );
read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD );
if (read.isEmpty())
continue;
// get bases of candidate haplotypes that overlap with reads
final int trailingBases = 3;
long readStart = read.getUnclippedStart();
long readEnd = read.getUnclippedEnd();
int numStartSoftClippedBases, numEndSoftClippedBases;
final long readStart = read.getSoftStart();
final long readEnd = read.getSoftEnd();
// see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
// but they're actually consistent with the insertion!
// Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
// Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
long eventStartPos = ref.getLocus().getStart();
// compute total number of clipped bases (soft or hard clipped)
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
// check for hard clips (never consider these bases):
Cigar c = read.getCigar();
CigarElement first = c.getCigarElement(0);
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
if (first.getOperator() == CigarOperator.H) {
numStartHardClippedBases = first.getLength();
}
if (last.getOperator() == CigarOperator.H) {
numEndHardClippedBases = last.getLength();
}
// correct for hard clips
numStartSoftClippedBases -= numStartHardClippedBases;
numEndSoftClippedBases -= numEndHardClippedBases;
readStart += numStartHardClippedBases;
readEnd -= numEndHardClippedBases;
// remove soft clips if necessary
if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
numStartSoftClippedBases = 0;
numEndSoftClippedBases = 0;
}
byte[] unclippedReadBases, unclippedReadQuals;
int numStartClippedBases = numStartSoftClippedBases;
int numEndClippedBases = numEndSoftClippedBases;
unclippedReadBases = read.getReadBases();
unclippedReadQuals = read.getBaseQualities();
final long eventStartPos = ref.getLocus().getStart();
// compute total number of clipped bases (soft or hard clipped) and only use them if necessary
final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength);
final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0;
final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ;
final byte [] unclippedReadBases = read.getReadBases();
final byte [] unclippedReadQuals = read.getBaseQualities();
final int extraOffset = Math.abs(eventLength);
/**
@ -284,64 +234,53 @@ public class PairHMMIndelErrorModel {
* Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord,
* adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above.
* We will propose haplotypes that overlap the read with some padding.
* True read start = readStart + numStartClippedBases - ReadUtils.getFirstInsertionOffset(read)
* True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read)
* Last term is because if a read starts with an insertion then these bases are not accounted for in readStart.
* trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to
* differentiate context between two haplotypes
*/
long startLocationInRefForHaplotypes = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
long stopLocationInRefForHaplotypes = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
long stopLocationInRefForHaplotypes = readEnd -numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
if (DEBUG)
System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
// check if start of read will be before start of reference context
if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) {
// read starts before haplotype: read will have to be cut
//numStartClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
startLocationInRefForHaplotypes = ref.getWindow().getStart();
startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
}
// check also if end of read will go beyond reference context
if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
//numEndClippedBases += stopLocationInRefForHaplotypes - ref.getWindow().getStop();
stopLocationInRefForHaplotypes = ref.getWindow().getStop();
stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context
}
// if there's an insertion in the read, the read stop position will be less than start + read length,
// but we want to compute likelihoods in the whole region that a read might overlap
if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) {
stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1;
stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read length, but we want to compute likelihoods in the whole region that a read might overlap
}
// ok, we now figured out total number of clipped bases on both ends.
// ok, we now figured out the total number of clipped bases on both ends.
// Figure out where we want to place the haplotype to score read against
if (DEBUG)
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();
/**
* Check if we'll end up with an empty read once all clipping is done
*/
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) {
int j=0;
for (Allele a: haplotypeMap.keySet()) {
readEl.put(a,0.0);
readLikelihoods[readIdx][j++] = 0.0;
}
}
else {
final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
int j=0;
// initialize path metric and traceback memories for likelihood computation
@ -351,7 +290,7 @@ public class PairHMMIndelErrorModel {
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
// get homopolymer length profile for current haplotype
int[] hrunProfile = new int[readBases.length];
final int[] hrunProfile = new int[readBases.length];
getContextHomopolymerLength(readBases,hrunProfile);
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
@ -431,6 +370,10 @@ public class PairHMMIndelErrorModel {
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}
/**
 * Decides whether the soft-clipped bases of {@code read} should be treated as real bases.
 *
 * Aligners may soft clip all bases at an insertion because they don't match the
 * reference, even though they are actually consistent with the insertion. A read
 * whose alignment start falls in [eventStartPos - eventLength, eventStartPos + 1],
 * or whose alignment end falls in [eventStartPos, eventStartPos + eventLength],
 * is considered adjacent to the event.
 *
 * NOTE(review): this returns {@code true} only when the read is NOT adjacent to the
 * event — callers use the result to decide whether to count (and strip) soft-clipped
 * bases; confirm the intended polarity against the caller before renaming.
 *
 * @param read          the (possibly clipped) read under consideration
 * @param eventStartPos reference coordinate where the candidate event starts
 * @param eventLength   length of the candidate indel event
 * @return true when the read does not start near or end within the event window
 */
private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) {
    final long alnStart = read.getAlignmentStart();
    final long alnEnd = read.getAlignmentEnd();
    final boolean startsNearEvent = alnStart >= eventStartPos - eventLength && alnStart <= eventStartPos + 1;
    final boolean endsWithinEvent = alnEnd >= eventStartPos && alnEnd <= eventStartPos + eventLength;
    return !(startsNearEvent || endsWithinEvent);
}
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
if (b1.length != b2.length)
return 0; // sanity check
@ -442,18 +385,7 @@ public class PairHMMIndelErrorModel {
return b1.length;
}
/**
 * Finds the index of the first element at which the two double arrays differ,
 * using {@link MathUtils#compareDoubles} for tolerant comparison.
 *
 * @param b1 first array
 * @param b2 second array
 * @return the first differing index; {@code b1.length} when the arrays are
 *         element-wise equal; 0 when the lengths differ (sanity fallback)
 */
private int computeFirstDifferingPosition(double[] b1, double[] b2) {
    if (b1.length != b2.length)
        return 0; // sanity check
    int idx = 0;
    // advance while elements compare equal; stops at the first mismatch
    while (idx < b1.length && MathUtils.compareDoubles(b1[idx], b2[idx]) == 0)
        idx++;
    return idx;
}
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix