diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java index 6cf1a1c1a..c1f23b7e4 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java @@ -320,7 +320,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo return null; // check if there is enough reference window to create haplotypes (can be an issue at end of contigs) - if (ref.getWindow().getStop() <= loc.getStop()+HAPLOTYPE_SIZE) + if (ref.getWindow().getStop() < loc.getStop()+HAPLOTYPE_SIZE) return null; if ( !(priors instanceof DiploidIndelGenotypePriors) ) throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model"); @@ -330,16 +330,19 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo refAllele = alleleList.get(0); altAllele = alleleList.get(1); - int eventLength = refAllele.getBaseString().length() - altAllele.getBaseString().length(); + int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); // assume only one alt allele for now - if (eventLength<0) - eventLength = - eventLength; - // int numSamples = getNSamples(contexts); List haplotypesInVC; - if (newLike) + if (newLike) { + int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1; + int numPrefBases = ref.getLocus().getStart()-ref.getWindow().getStart()+1; + if (DEBUG) + System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength, + (int)ref.getWindow().size(), loc.getStart(), numPrefBases); haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), - ref, HAPLOTYPE_SIZE, HAPLOTYPE_SIZE/2); + ref, hsize, numPrefBases); + } else haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), ref, HAPLOTYPE_SIZE, 20); @@ -351,13 +354,6 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo double[][] haplotypeLikehoodMatrix; - if (useFlatPriors) { - priors = new DiploidIndelGenotypePriors(); - } - else - priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,HAPLOTYPE_SIZE); - - //double[] priorLikelihoods = priors.getPriors(); for ( Map.Entry sample : contexts.entrySet() ) { AlignmentContext context = sample.getValue().getContext(contextType); @@ -371,7 +367,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo if (pileup != null ) { if (newLike) - haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref); + haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref, HAPLOTYPE_SIZE, eventLength); else haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index c911038fb..1c731b162 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -45,7 +45,7 @@ import java.util.List; public class PairHMMIndelErrorModel { - private final int BASE_QUAL_THRESHOLD = 4; + private final int BASE_QUAL_THRESHOLD = 10; private static final int MATCH_OFFSET = 0; @@ -85,34 +85,24 @@ public class PairHMMIndelErrorModel { private final static double LOG_ONE_THIRD; private final static double LOG_ONE_HALF; + private final static double END_GAP_COST; private static final int START_HRUN_GAP_IDX = 4; private static final int MAX_HRUN_GAP_IDX = 20; - private static final double MIN_GAP_PENALTY = 20.0; + private static final double MIN_GAP_OPEN_PENALTY = 20.0; + private static final double MIN_GAP_CONT_PENALTY = 6.0; private static final double GAP_PENALTY_HRUN_STEP = 2.0; // each increase in hrun decreases gap penalty by this. - /* - private final double c; - private final double d; - private final double e; - private final double oneMinusTwoEps; - */ - private final int trailingBases = 3; private boolean doViterbi = false; - private final boolean useSoftClippedBases = false; + private final boolean useAffineGapModel = true; private boolean doContextDependentPenalties = false; private String s1; private String s2; - private double[][] contextLogGapOpenProbabilities; - private double[][] contextLogGapContinuationProbabilities; - - private double[] currentContextGOP; - private double[] currentContextGCP; private final double[] GAP_OPEN_PROB_TABLE; private final double[] GAP_CONT_PROB_TABLE; @@ -122,7 +112,7 @@ public class PairHMMIndelErrorModel { static { LOG_ONE_THIRD= -Math.log10(3.0); LOG_ONE_HALF= -Math.log10(2.0); - final double OFFSET = -Math.log10(0.25); + END_GAP_COST = LOG_ONE_HALF; baseMatchArray = new double[MAX_CACHED_QUAL+1]; baseMismatchArray = new double[MAX_CACHED_QUAL+1]; @@ -130,8 +120,8 @@ public class PairHMMIndelErrorModel { double baseProb = Math.pow(10, -k/10.); - baseMatchArray[k] = Math.log10(1-baseProb)+ OFFSET; - baseMismatchArray[k] = Math.log10(baseProb)+LOG_ONE_THIRD+OFFSET; + baseMatchArray[k] = Math.log10(1-baseProb); + baseMismatchArray[k] = Math.log10(baseProb); } } @@ -164,16 +154,17 @@ public class PairHMMIndelErrorModel { double gcp = logGapContinuationProbability; double step = GAP_PENALTY_HRUN_STEP/10.0; - double maxGP = -MIN_GAP_PENALTY/10.0; // phred to log prob + double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob + double maxGCP = -MIN_GAP_CONT_PENALTY/10.0; // phred to log prob for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { gop += step; - if (gop > maxGP) - gop = maxGP; + if (gop > maxGOP) + gop = maxGOP; gcp += step; - if(gcp > maxGP) - gcp = maxGP; + if(gcp > maxGCP) + gcp = maxGCP; GAP_OPEN_PROB_TABLE[i] = gop; GAP_CONT_PROB_TABLE[i] = gcp; } @@ -182,7 +173,7 @@ public class PairHMMIndelErrorModel { } - public double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) { + private double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) { final int X_METRIC_LENGTH = readBases.length+1; final int Y_METRIC_LENGTH = haplotypeBases.length+1; @@ -308,7 +299,7 @@ public class PairHMMIndelErrorModel { } - private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) { + static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG // 001000012345000000 @@ -340,7 +331,7 @@ public class PairHMMIndelErrorModel { hrunArray[i] = hforward[i]+hreverse[i]; } private Pair getGapPenalties(final int indI, final int indJ, final int X_METRIC_LENGTH, - final int Y_METRIC_LENGTH, final int tableToUpdate) { + final int Y_METRIC_LENGTH, final int tableToUpdate, double[] currentContextGOP, double[] currentContextGCP) { double c=0.0,d=0.0; @@ -350,33 +341,34 @@ public class PairHMMIndelErrorModel { break; case X_OFFSET: - c = (indJ==Y_METRIC_LENGTH-1? 0: currentContextGOP[indJ-1]); - d = (indJ==Y_METRIC_LENGTH-1? 0: currentContextGCP[indJ-1]); + c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentContextGOP[indJ-1]); + d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentContextGCP[indJ-1]); break; case Y_OFFSET: - c = (indI==X_METRIC_LENGTH-1? 0: currentContextGOP[indJ-1]); - d = (indI==X_METRIC_LENGTH-1? 0: currentContextGCP[indJ-1]); + c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentContextGOP[indJ-1]); + d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentContextGCP[indJ-1]); break; default: throw new StingException("BUG!! Invalid table offset"); } - } else { + } + else { switch (tableToUpdate) { case MATCH_OFFSET: break; case X_OFFSET: - c = (indJ==Y_METRIC_LENGTH-1? 0: logGapOpenProbability); - d = (indJ==Y_METRIC_LENGTH-1? 0: logGapContinuationProbability); + c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: logGapOpenProbability); + d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: logGapContinuationProbability); break; case Y_OFFSET: - c = (indI==X_METRIC_LENGTH-1? 0: logGapOpenProbability); - d = (indI==X_METRIC_LENGTH-1? 0: logGapContinuationProbability); + c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: logGapOpenProbability); + d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: logGapContinuationProbability); break; default: @@ -386,7 +378,8 @@ public class PairHMMIndelErrorModel { return new Pair(Double.valueOf(c),Double.valueOf(d)); } - public double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) { + private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, + double[] currentGOP, double[] currentGCP) { final int X_METRIC_LENGTH = readBases.length+1; final int Y_METRIC_LENGTH = haplotypeBases.length+1; @@ -399,13 +392,13 @@ public class PairHMMIndelErrorModel { int[][] bestActionArrayX = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; int[][] bestActionArrayY = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - matchMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY; + matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY; for (int i=1; i < X_METRIC_LENGTH; i++) { //initialize first column matchMetricArray[i][0] = Double.NEGATIVE_INFINITY; YMetricArray[i][0] = Double.NEGATIVE_INFINITY; - XMetricArray[i][0] = 0;//logGapOpenProbability + (i-1)*logGapContinuationProbability; + XMetricArray[i][0] = END_GAP_COST*(i);//logGapOpenProbability + (i-1)*logGapContinuationProbability; bestActionArrayX[i][0] = bestActionArrayY[i][0] = bestActionArrayM[i][0] = UP_GOTO_X; } @@ -414,7 +407,7 @@ public class PairHMMIndelErrorModel { // initialize first row matchMetricArray[0][j] = Double.NEGATIVE_INFINITY; XMetricArray[0][j] = Double.NEGATIVE_INFINITY; - YMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability; + YMetricArray[0][j] = END_GAP_COST*(j);//logGapOpenProbability + (j-1) * logGapContinuationProbability; bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y; } @@ -460,8 +453,7 @@ public class PairHMMIndelErrorModel { // update X array // State X(i,j): X(1:i) aligned to a gap in Y(1:j). // When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps - Pair p = getGapPenalties(indI, indJ, X_METRIC_LENGTH, - Y_METRIC_LENGTH, X_OFFSET); + Pair p = getGapPenalties(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, X_OFFSET, currentGOP, currentGCP); metrics[MATCH_OFFSET] = matchMetricArray[indI-1][indJ] + p.first; metrics[X_OFFSET] = XMetricArray[indI-1][indJ] + p.second; @@ -478,8 +470,7 @@ public class PairHMMIndelErrorModel { bestActionArrayX[indI][indJ] = ACTIONS_X[bestMetricIdx]; // update Y array - p = getGapPenalties(indI, indJ, X_METRIC_LENGTH, - Y_METRIC_LENGTH, Y_OFFSET); + p = getGapPenalties(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, Y_OFFSET, currentGOP, currentGCP); metrics[MATCH_OFFSET] = matchMetricArray[indI][indJ-1] + p.first; metrics[X_OFFSET] = Double.NEGATIVE_INFINITY; //XMetricArray[indI][indJ-1] + logGapOpenProbability; @@ -578,7 +569,8 @@ public class PairHMMIndelErrorModel { } - private void getGapProbabilities(int hIndex, int[] hrunProfile) { + private void fillGapProbabilities(int hIndex, int[] hrunProfile, + double[][] contextLogGapOpenProbabilities, double[][] contextLogGapContinuationProbabilities) { // fill based on lookup table for (int i = 0; i < hrunProfile.length; i++) { if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) { @@ -591,36 +583,44 @@ public class PairHMMIndelErrorModel { } } } - public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List haplotypesInVC, ReferenceContext ref){ + public synchronized double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List haplotypesInVC, + ReferenceContext ref, int haplotypeSize, int eventLength){ double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()]; int readIdx=0; + double[][] contextLogGapOpenProbabilities = null; + double[][] contextLogGapContinuationProbabilities = null; + + if (DEBUG) { System.out.println("Reference bases:"); System.out.println(new String(ref.getBases())); } + if (doContextDependentPenalties) { + // will context dependent probabilities based on homopolymet run. Probabilities are filled based on total complete haplotypes. - for (int j=0; j < haplotypesInVC.size(); j++) { - Haplotype haplotype = haplotypesInVC.get(j); - byte[] haplotypeBases = haplotype.getBasesAsBytes(); - if (contextLogGapOpenProbabilities == null) { - contextLogGapOpenProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length]; - contextLogGapContinuationProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length]; - } - // get homopolymer length profile for current haplotype - int[] hrunProfile = new int[haplotypeBases.length]; - getContextHomopolymerLength(haplotypeBases,hrunProfile); - if (DEBUG) { - System.out.println("Haplotype bases:"); - System.out.println(new String(haplotypeBases)); - for (int i=0; i < hrunProfile.length; i++) - System.out.format("%d",hrunProfile[i]); - System.out.println(); - } - getGapProbabilities(j, hrunProfile); + for (int j=0; j < haplotypesInVC.size(); j++) { + Haplotype haplotype = haplotypesInVC.get(j); + byte[] haplotypeBases = haplotype.getBasesAsBytes(); + if (contextLogGapOpenProbabilities == null) { + contextLogGapOpenProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length]; + contextLogGapContinuationProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length]; + } + // get homopolymer length profile for current haplotype + int[] hrunProfile = new int[haplotypeBases.length]; + getContextHomopolymerLength(haplotypeBases,hrunProfile); + if (DEBUG) { + System.out.println("Haplotype bases:"); + System.out.println(new String(haplotypeBases)); + for (int i=0; i < hrunProfile.length; i++) + System.out.format("%d",hrunProfile[i]); + System.out.println(); + } + fillGapProbabilities(j, hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); + } } for (SAMRecord read : pileup.getReads()) { @@ -634,16 +634,31 @@ public class PairHMMIndelErrorModel { // get bases of candidate haplotypes that overlap with reads - int offset = trailingBases / 2; + final int trailingBases = 3; + long readStart = read.getUnclippedStart(); long readEnd = read.getUnclippedEnd(); - int numStartSoftClippedBases =0, numEndSoftClippedBases = 0; - if (!useSoftClippedBases) { - numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart(); - numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd(); + int numStartSoftClippedBases, numEndSoftClippedBases; + // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match, + // but they're actually consistent with the insertion! + // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning. + // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read. + long eventStartPos = ref.getLocus().getStart(); + + // default: discard soft-clipped bases + numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart(); + numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd(); + + if (eventLength > 0) { + if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || + (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) { + numStartSoftClippedBases = 0; + numEndSoftClippedBases = 0; + } } + byte[] unclippedReadBases, unclippedReadQuals; @@ -668,14 +683,10 @@ public class PairHMMIndelErrorModel { break; } - long start = Math.max(readStart + numStartClippedBases - offset - ReadUtils.getFirstInsertionOffset(read), 0); - long stop = readEnd -numEndClippedBases + offset + ReadUtils.getLastInsertionOffset(read); + int extraOffset = Math.abs(eventLength); - // ok, we now figured out total number of clipped bases on both ends. - // Figure out where we want to place the haplotype to score read against - if (DEBUG) - System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", - numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); + long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); + long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from. int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; @@ -693,16 +704,13 @@ public class PairHMMIndelErrorModel { stop = start + readLength-1; } + // ok, we now figured out total number of clipped bases on both ends. + // Figure out where we want to place the haplotype to score read against + if (DEBUG) + System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); - /* - byte[] readAlignedBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getReadBases()); // Adjust the read bases based on the Cigar string - byte[] readAlignedQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getBaseQualities()); // Shift the location of the qual scores based on the Cigar string - - - System.out.println("Aligned bases:"+new String(readAlignedBases)); - */ - if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) { if (DEBUG) @@ -731,15 +739,21 @@ public class PairHMMIndelErrorModel { for (int j=0; j < haplotypesInVC.size(); j++) { Haplotype haplotype = haplotypesInVC.get(j); + if (stop > haplotype.getStopPosition()) + stop = haplotype.getStopPosition(); + + if (start < haplotype.getStartPosition()) + start = haplotype.getStartPosition(); + // cut haplotype bases - //long indStart = start - haplotype.getStartPosition(); - //long indStop = stop - haplotype.getStartPosition(); + long indStart = start - haplotype.getStartPosition(); + long indStop = stop - haplotype.getStartPosition(); //long indStart = 0; //long indStop = 400; - //byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(), - // (int)indStart, (int)indStop); - byte[] haplotypeBases = haplotype.getBasesAsBytes(); + byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(), + (int)indStart, (int)indStop); + //byte[] haplotypeBases = haplotype.getBasesAsBytes(); //BAQ.BAQCalculationResult baqRes = baq.calcBAQFromHMM(read, haplotypeBases, (int)(start - readStart)); /* if (s1 != null && s2 != null) { haplotypeBases = s2.getBytes(); @@ -770,9 +784,15 @@ public class PairHMMIndelErrorModel { } if (useAffineGapModel) { - currentContextGOP = contextLogGapOpenProbabilities[j]; - currentContextGCP = contextLogGapContinuationProbabilities[j]; - readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals); + + double[] currentContextGOP = null; + double[] currentContextGCP = null; + + if (doContextDependentPenalties) { + currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop); + currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop); + } + readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP); } else