Merged bug fix from Stable into Unstable
commit 4109fcbb08

org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java

@@ -25,15 +25,11 @@
 package org.broadinstitute.sting.gatk.walkers.indels;

-import net.sf.samtools.Cigar;
-import net.sf.samtools.CigarElement;
-import net.sf.samtools.CigarOperator;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.utils.Haplotype;
-import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.PairHMM;
 import org.broadinstitute.sting.utils.clipping.ReadClipper;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@@ -43,7 +39,6 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
-import java.util.Map;


 public class PairHMMIndelErrorModel {
@@ -58,7 +53,6 @@
     private static final double baseMismatchArray[];

     private final static double LOG_ONE_HALF;
-    private final static double END_GAP_COST;

     private static final int START_HRUN_GAP_IDX = 4;
     private static final int MAX_HRUN_GAP_IDX = 20;
@@ -76,7 +70,6 @@

     static {
         LOG_ONE_HALF= -Math.log10(2.0);
-        END_GAP_COST = LOG_ONE_HALF;

         baseMatchArray = new double[MAX_CACHED_QUAL+1];
         baseMismatchArray = new double[MAX_CACHED_QUAL+1];
@@ -128,7 +121,6 @@
         // compute forward hrun length, example:
         // AGGTGACCCCCCTGAGAG
         // 001000012345000000
-        int runCount = 0;
         hrunArray[0] = 0;
         int[] hforward = new int[hrunArray.length];
         int[] hreverse = new int[hrunArray.length];
@@ -172,17 +164,13 @@
             }
         }
     }
-    public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
-                                                                 ReferenceContext ref, int eventLength,
-                                                                 HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
-
-        int numHaplotypes = haplotypeMap.size();
+    public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
+        final int numHaplotypes = haplotypeMap.size();
         final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes];
         final int readCounts[] = new int[pileup.getNumberOfElements()];
+        final PairHMM pairHMM = new PairHMM(bandedLikelihoods);

         int readIdx=0;
-
-        PairHMM pairHMM = new PairHMM(bandedLikelihoods);
         for (PileupElement p: pileup) {
             // > 1 when the read is a consensus read representing multiple independent observations
             readCounts[readIdx] = p.getRepresentativeCount();
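
Note: the hunk above joins the wrapped method signature onto a single line, makes numHaplotypes final, and moves the PairHMM construction up with the other initializations as a final local. A hypothetical call-site sketch under the new signature (model, pileup, ref, and eventLength are assumed to be in scope; none of these names appear in the commit):

    // Hypothetical usage sketch, not part of the commit.
    LinkedHashMap<Allele, Haplotype> haplotypeMap = new LinkedHashMap<Allele, Haplotype>();   // candidate haplotypes per allele
    HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap =
            new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();                      // filled in per pileup element
    double[] haplotypeLikelihoods =
            model.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap);
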
@@ -199,84 +187,46 @@
             if (DEBUG) {
                 System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
             }
             // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
             GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());

             if (read.isEmpty())
                 continue;

-            if (read.getUnclippedEnd() > ref.getWindow().getStop())
+            if (read.getSoftEnd() > ref.getWindow().getStop())
                 read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop());

             if (read.isEmpty())
                 continue;

-            if (read.getUnclippedStart() < ref.getWindow().getStart())
+            if (read.getSoftStart() < ref.getWindow().getStart())
                 read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart());

             if (read.isEmpty())
                 continue;

             // hard-clip low quality ends - this may introduce extra H elements in CIGAR string
-            read = ReadClipper.hardClipLowQualEnds(read,(byte)BASE_QUAL_THRESHOLD );
+            read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD );

             if (read.isEmpty())
                 continue;

             // get bases of candidate haplotypes that overlap with reads
             final int trailingBases = 3;

-            long readStart = read.getUnclippedStart();
-            long readEnd = read.getUnclippedEnd();
-
-            int numStartSoftClippedBases, numEndSoftClippedBases;
+            final long readStart = read.getSoftStart();
+            final long readEnd = read.getSoftEnd();

             // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
             // but they're actually consistent with the insertion!
             // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
             // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
-            long eventStartPos = ref.getLocus().getStart();
-
-            // compute total number of clipped bases (soft or hard clipped)
-            numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
-            numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
-
-            // check for hard clips (never consider these bases):
-            Cigar c = read.getCigar();
-            CigarElement first = c.getCigarElement(0);
-            CigarElement last = c.getCigarElement(c.numCigarElements()-1);
-            int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
-
-            if (first.getOperator() == CigarOperator.H) {
-                numStartHardClippedBases = first.getLength();
-            }
-
-            if (last.getOperator() == CigarOperator.H) {
-                numEndHardClippedBases = last.getLength();
-            }
-
-            // correct for hard clips
-            numStartSoftClippedBases -= numStartHardClippedBases;
-            numEndSoftClippedBases -= numEndHardClippedBases;
-            readStart += numStartHardClippedBases;
-            readEnd -= numEndHardClippedBases;
-
-            // remove soft clips if necessary
-            if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
-                (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
-                numStartSoftClippedBases = 0;
-                numEndSoftClippedBases = 0;
-            }
-
-            byte[] unclippedReadBases, unclippedReadQuals;
-
-            int numStartClippedBases = numStartSoftClippedBases;
-            int numEndClippedBases = numEndSoftClippedBases;
-            unclippedReadBases = read.getReadBases();
-            unclippedReadQuals = read.getBaseQualities();
+            final long eventStartPos = ref.getLocus().getStart();

+            // compute total number of clipped bases (soft or hard clipped) and only use them if necessary
+            final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength);
+            final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0;
+            final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ;
+            final byte [] unclippedReadBases = read.getReadBases();
+            final byte [] unclippedReadQuals = read.getBaseQualities();
             final int extraOffset = Math.abs(eventLength);

             /**
@@ -284,64 +234,53 @@
             * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord,
             * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above.
             * We will propose haplotypes that overlap the read with some padding.
-            * True read start = readStart + numStartClippedBases - ReadUtils.getFirstInsertionOffset(read)
+            * True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read)
             * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart.
             * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to
             * differentiate context between two haplotypes
             */
-            long startLocationInRefForHaplotypes = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
-            long stopLocationInRefForHaplotypes = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
+            long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
+            long stopLocationInRefForHaplotypes = readEnd -numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;

             if (DEBUG)
                 System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);

             int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
-            // check if start of read will be before start of reference context
-
             if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) {
-                // read starts before haplotype: read will have to be cut
-                //numStartClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
-                startLocationInRefForHaplotypes = ref.getWindow().getStart();
+                startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
             }
-            // check also if end of read will go beyond reference context
-
             if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
-                //numEndClippedBases += stopLocationInRefForHaplotypes - ref.getWindow().getStop();
-                stopLocationInRefForHaplotypes = ref.getWindow().getStop();
+                stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context
             }

-            // if there's an insertion in the read, the read stop position will be less than start + read legnth,
-            // but we want to compute likelihoods in the whole region that a read might overlap
             if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) {
-                stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1;
+                stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read legnth, but we want to compute likelihoods in the whole region that a read might overlap
             }

-            // ok, we now figured out total number of clipped bases on both ends.
+            // ok, we now figured out the total number of clipped bases on both ends.
             // Figure out where we want to place the haplotype to score read against

             if (DEBUG)
-                System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
-                        numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
+                System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
+                        numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());

             LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();

             /**
             * Check if we'll end up with an empty read once all clipping is done
             */
-            if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
+            if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) {
                 int j=0;
                 for (Allele a: haplotypeMap.keySet()) {
                     readEl.put(a,0.0);
                     readLikelihoods[readIdx][j++] = 0.0;
                 }
             }
             else {
-                final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
-                        unclippedReadBases.length-numEndClippedBases);
-
-                final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
-                        unclippedReadBases.length-numEndClippedBases);
-
+                final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
+                final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
                 int j=0;

                 // initialize path metric and traceback memories for likelihood computation
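
Note: the surviving window arithmetic pads the soft-clip-adjusted read span by trailingBases (3) plus abs(eventLength) on each side before clamping to the reference window. A toy walk-through with invented numbers, assuming a read with no leading or trailing insertion so both ReadUtils insertion offsets are 0 (nothing below is from the commit):

    // Hypothetical values, for illustration only.
    long readStart = 1000, readEnd = 1075;    // read.getSoftStart() / read.getSoftEnd()
    int numStartSoftClippedBases = 5, numEndSoftClippedBases = 0;
    int trailingBases = 3;
    int extraOffset = 2;                      // Math.abs(eventLength) for a 2bp indel
    long start = Math.max(readStart + numStartSoftClippedBases - trailingBases - extraOffset, 0); // 1000
    long stop  = readEnd - numEndSoftClippedBases + trailingBases + extraOffset;                  // 1080
    // Both bounds are then clamped to [ref.getWindow().getStart(), ref.getWindow().getStop()].
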
@@ -351,7 +290,7 @@
                 final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];

                 // get homopolymer length profile for current haplotype
-                int[] hrunProfile = new int[readBases.length];
+                final int[] hrunProfile = new int[readBases.length];
                 getContextHomopolymerLength(readBases,hrunProfile);
                 fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
@@ -431,6 +370,10 @@
         return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
     }

+    private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) {
+        return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength));
+    }
+
     private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
         if (b1.length != b2.length)
             return 0; // sanity check
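
Note: the new useSoftClippedBases helper returns true when the read is far enough from the event that its soft-clipped bases get trimmed, and false (keep them) when the read starts in [eventStart-eventLength, eventStart+1] or ends in [eventStart, eventStart+eventLength], matching the rule stated in the comment earlier in this diff. A standalone sketch of the same predicate with plain ints instead of GATKSAMRecord (hypothetical names, not part of the commit):

    // true  -> trim the soft-clipped bases from both ends
    // false -> read abuts the indel, so keep the soft clips: they may support the event
    static boolean trimSoftClips(int alignStart, int alignEnd, int eventStart, int eventLength) {
        boolean startsNearEvent = alignStart >= eventStart - eventLength && alignStart <= eventStart + 1;
        boolean endsNearEvent   = alignEnd   >= eventStart               && alignEnd   <= eventStart + eventLength;
        return !(startsNearEvent || endsNearEvent);
    }
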
@@ -442,18 +385,7 @@
         return b1.length;
     }

-    private int computeFirstDifferingPosition(double[] b1, double[] b2) {
-        if (b1.length != b2.length)
-            return 0; // sanity check
-
-        for (int i=0; i < b1.length; i++ ){
-            if ( MathUtils.compareDoubles(b1[i], b2[i]) != 0 )
-                return i;
-        }
-        return b1.length;
-    }
-
-    private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
+    private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
         final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];

         // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix
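
Note: dropping final from the static getHaplotypeLikelihoods is purely cosmetic; static methods cannot be overridden (only hidden), so final has no effect there. A minimal illustration (hypothetical class, not from the commit):

    class Demo {
        private final static double[] oldStyle() { return new double[0]; } // 'final' is redundant on a static method
        private static double[] newStyle() { return new double[0]; }       // identical semantics
    }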