diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
index bcb9ea591..08bd51134 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@@ -25,15 +25,11 @@
 package org.broadinstitute.sting.gatk.walkers.indels;
 
-import net.sf.samtools.Cigar;
-import net.sf.samtools.CigarElement;
-import net.sf.samtools.CigarOperator;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.utils.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.PairHMM;
 import org.broadinstitute.sting.utils.clipping.ReadClipper;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@@ -43,7 +39,6 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
 
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
-import java.util.Map;
 
 public class PairHMMIndelErrorModel {
@@ -58,7 +53,6 @@ public class PairHMMIndelErrorModel {
     private static final double baseMismatchArray[];
 
     private final static double LOG_ONE_HALF;
-    private final static double END_GAP_COST;
 
     private static final int START_HRUN_GAP_IDX = 4;
     private static final int MAX_HRUN_GAP_IDX = 20;
@@ -76,7 +70,6 @@ public class PairHMMIndelErrorModel {
     static {
         LOG_ONE_HALF= -Math.log10(2.0);
-        END_GAP_COST = LOG_ONE_HALF;
 
         baseMatchArray = new double[MAX_CACHED_QUAL+1];
         baseMismatchArray = new double[MAX_CACHED_QUAL+1];
@@ -128,7 +121,6 @@ public class PairHMMIndelErrorModel {
         // compute forward hrun length, example:
         // AGGTGACCCCCCTGAGAG
         // 001000012345000000
-        int runCount = 0;
         hrunArray[0] = 0;
         int[] hforward = new int[hrunArray.length];
         int[] hreverse = new int[hrunArray.length];
@@ -172,17 +164,13 @@ public class PairHMMIndelErrorModel {
             }
         }
     }
-    public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
-                                                                  ReferenceContext ref, int eventLength,
-                                                                  HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap){
-
-        int numHaplotypes = haplotypeMap.size();
+    public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap){
+        final int numHaplotypes = haplotypeMap.size();
         final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes];
         final int readCounts[] = new int[pileup.getNumberOfElements()];
+        final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
+
         int readIdx=0;
-
-
-        PairHMM pairHMM = new PairHMM(bandedLikelihoods);
         for (PileupElement p: pileup) {
             // > 1 when the read is a consensus read representing multiple independent observations
             readCounts[readIdx] = p.getRepresentativeCount();
@@ -199,84 +187,46 @@ public class PairHMMIndelErrorModel {
             if (DEBUG) {
                 System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
             }
-            // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
             GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
             if (read.isEmpty())
                 continue;
 
-            if (read.getUnclippedEnd() > ref.getWindow().getStop())
+            if (read.getSoftEnd() > ref.getWindow().getStop())
                 read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop());
             if (read.isEmpty())
                 continue;
 
-            if (read.getUnclippedStart() < ref.getWindow().getStart())
+            if (read.getSoftStart() < ref.getWindow().getStart())
                 read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart());
             if (read.isEmpty())
                 continue;
+
             // hard-clip low quality ends - this may introduce extra H elements in CIGAR string
-            read = ReadClipper.hardClipLowQualEnds(read,(byte)BASE_QUAL_THRESHOLD );
+            read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD );
            if (read.isEmpty())
                continue;
-
             // get bases of candidate haplotypes that overlap with reads
             final int trailingBases = 3;
-
-            long readStart = read.getUnclippedStart();
-            long readEnd = read.getUnclippedEnd();
-
-            int numStartSoftClippedBases, numEndSoftClippedBases;
+            final long readStart = read.getSoftStart();
+            final long readEnd = read.getSoftEnd();
 
             // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
             // but they're actually consistent with the insertion!
            // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
            // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
-            long eventStartPos = ref.getLocus().getStart();
-
-            // compute total number of clipped bases (soft or hard clipped)
-            numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
-            numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
-
-            // check for hard clips (never consider these bases):
-            Cigar c = read.getCigar();
-            CigarElement first = c.getCigarElement(0);
-            CigarElement last = c.getCigarElement(c.numCigarElements()-1);
-            int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
-
-            if (first.getOperator() == CigarOperator.H) {
-                numStartHardClippedBases = first.getLength();
-            }
-
-            if (last.getOperator() == CigarOperator.H) {
-                numEndHardClippedBases = last.getLength();
-            }
-
-            // correct for hard clips
-            numStartSoftClippedBases -= numStartHardClippedBases;
-            numEndSoftClippedBases -= numEndHardClippedBases;
-            readStart += numStartHardClippedBases;
-            readEnd -= numEndHardClippedBases;
-
-            // remove soft clips if necessary
-            if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
-                (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
-                numStartSoftClippedBases = 0;
-                numEndSoftClippedBases = 0;
-            }
-
-
-
-            byte[] unclippedReadBases, unclippedReadQuals;
-
-            int numStartClippedBases = numStartSoftClippedBases;
-            int numEndClippedBases = numEndSoftClippedBases;
-            unclippedReadBases = read.getReadBases();
-            unclippedReadQuals = read.getBaseQualities();
+            final long eventStartPos = ref.getLocus().getStart();
+            // compute total number of clipped bases (soft or hard clipped) and only use them if necessary
+            final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength);
+            final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0;
+            final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ;
+            final byte [] unclippedReadBases = read.getReadBases();
+            final byte [] unclippedReadQuals = read.getBaseQualities();
 
             final int extraOffset = Math.abs(eventLength);
 
             /**
@@ -284,64 +234,53 @@ public class PairHMMIndelErrorModel {
              * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord,
              * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above.
              * We will propose haplotypes that overlap the read with some padding.
-             * True read start = readStart + numStartClippedBases - ReadUtils.getFirstInsertionOffset(read)
+             * True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read)
              * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart.
             * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to
             * differentiate context between two haplotypes
             */
-            long startLocationInRefForHaplotypes = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
-            long stopLocationInRefForHaplotypes = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
+            long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
+            long stopLocationInRefForHaplotypes = readEnd -numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
 
             if (DEBUG)
                 System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
 
             int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
 
-            // check if start of read will be before start of reference context
+
             if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) {
-                // read starts before haplotype: read will have to be cut
-                //numStartClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
-                startLocationInRefForHaplotypes = ref.getWindow().getStart();
+                startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
             }
 
-            // check also if end of read will go beyond reference context
+
             if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
-                //numEndClippedBases += stopLocationInRefForHaplotypes - ref.getWindow().getStop();
-                stopLocationInRefForHaplotypes = ref.getWindow().getStop();
+                stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context
            }
 
-            // if there's an insertion in the read, the read stop position will be less than start + read legnth,
-            // but we want to compute likelihoods in the whole region that a read might overlap
             if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) {
-                stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1;
+                stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read legnth, but we want to compute likelihoods in the whole region that a read might overlap
             }
 
-            // ok, we now figured out total number of clipped bases on both ends.
+            // ok, we now figured out the total number of clipped bases on both ends.
             // Figure out where we want to place the haplotype to score read against
             if (DEBUG)
-                System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
-                        numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
-
+                System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
+                        numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
 
             LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();
 
             /**
             * Check if we'll end up with an empty read once all clipping is done
             */
-            if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
+            if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) {
                int j=0;
                for (Allele a: haplotypeMap.keySet()) {
                    readEl.put(a,0.0);
                    readLikelihoods[readIdx][j++] = 0.0;
                }
-
            } else {
-                final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
-                        unclippedReadBases.length-numEndClippedBases);
-
-                final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
-                        unclippedReadBases.length-numEndClippedBases);
-
+                final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
+                final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
                int j=0;
 
                // initialize path metric and traceback memories for likelihood computation
@@ -351,7 +290,7 @@ public class PairHMMIndelErrorModel {
                final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
 
                // get homopolymer length profile for current haplotype
-                int[] hrunProfile = new int[readBases.length];
+                final int[] hrunProfile = new int[readBases.length];
                getContextHomopolymerLength(readBases,hrunProfile);
                fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
@@ -431,6 +370,10 @@ public class PairHMMIndelErrorModel {
        return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
    }
 
+    private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) {
+        return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength));
+    }
+
    private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
        if (b1.length != b2.length)
            return 0; // sanity check
@@ -442,18 +385,7 @@ public class PairHMMIndelErrorModel {
        return b1.length;
    }
 
-    private int computeFirstDifferingPosition(double[] b1, double[] b2) {
-        if (b1.length != b2.length)
-            return 0; // sanity check
-
-        for (int i=0; i < b1.length; i++ ){
-            if ( MathUtils.compareDoubles(b1[i], b2[i]) != 0 )
-                return i;
-        }
-        return b1.length;
-    }
-
-    private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
+    private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
        final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
 
        // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix
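
Illustration (not part of the patch): a minimal, non-authoritative sketch of the diploid combination that getHaplotypeLikelihoods() appears to perform with the NxN haplotypeLikehoodMatrix mentioned in the todo above. It assumes only what is visible in this diff: per-read, per-haplotype log10 likelihoods (readLikelihoods), consensus read counts (readCounts), and LOG_ONE_HALF = -log10(2). The real method's normalization and return layout are not shown in this hunk and are omitted here; the class and method names below are hypothetical, and plain java.lang.Math is used instead of the GATK MathUtils helpers.

// Sketch only, under the assumptions stated above; not the committed GATK code.
class HaplotypePairLikelihoodSketch {
    private static final double LOG_ONE_HALF = -Math.log10(2.0);

    // For each read r (weighted by readCounts[r]) and each haplotype pair (i, j),
    // assume the read was sampled from either haplotype with probability 1/2:
    //   log10 L(r | i, j) = LOG_ONE_HALF + log10(10^L_i + 10^L_j)
    // and accumulate the per-read terms into the upper-triangular pair matrix.
    static double[][] pairLikelihoods(final int numHaplotypes,
                                      final int[] readCounts,
                                      final double[][] readLikelihoods) {
        final double[][] haplotypePairLikelihoods = new double[numHaplotypes][numHaplotypes];
        for (int r = 0; r < readLikelihoods.length; r++) {
            for (int i = 0; i < numHaplotypes; i++) {
                for (int j = i; j < numHaplotypes; j++) {
                    // Direct pow/log10 is used for clarity; production code would use a
                    // log10-sum helper to avoid underflow for very negative likelihoods.
                    final double log10Sum = Math.log10(Math.pow(10.0, readLikelihoods[r][i])
                                                     + Math.pow(10.0, readLikelihoods[r][j]));
                    haplotypePairLikelihoods[i][j] += readCounts[r] * (LOG_ONE_HALF + log10Sum);
                }
            }
        }
        return haplotypePairLikelihoods;
    }
}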