diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index ccdf30f06..dd3bea0a7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotype.Haplotype; +import java.util.Arrays; import java.util.List; /** @@ -63,6 +64,15 @@ public class HaplotypeIndelErrorModel { private final double logInsertionEndProbability; private final double logOneMinusInsertionEndProbability; + private boolean DEBUG = false; + private boolean doSimpleCalculationModel = false; + + public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize, int maxReadLength, + boolean dosimple, boolean deb) { + this(mrdl, insStart, insEnd, alpha, haplotypeSize, maxReadLength); + this.DEBUG = deb; + this.doSimpleCalculationModel = dosimple; + } public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize, int maxReadLength) { this.maxReadDeletionLength = mrdl; this.noDeletionProbability = 1-alpha; @@ -115,56 +125,133 @@ public class HaplotypeIndelErrorModel { } public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) { - + long numStartClippedBases = read.getAlignmentStart() - read.getUnclippedStart(); + long numEndClippedBases = read.getUnclippedEnd() - read.getAlignmentEnd(); final long readStartPosition = read.getAlignmentStart(); final long haplotypeStartPosition = haplotype.getStartPosition(); - final long readEndPosition = read.getUnclippedEnd(); - final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1; - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); + //final long readEndPosition = read.getUnclippedEnd(); + //final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1; + + byte[] readBases = Arrays.copyOfRange(read.getReadBases(),(int)numStartClippedBases, + (int)(read.getReadBases().length-numEndClippedBases)); + + byte[] readQuals = Arrays.copyOfRange(read.getBaseQualities(),(int)numStartClippedBases, + (int)(read.getReadBases().length-numEndClippedBases)); double[] previousPathMetricArray = new double[2*PATH_METRIC_TABLE_LENGTH]; int readStartIdx, initialIndexInHaplotype; - /* List b = read.getAlignmentBlocks(); - // TODO temp hack, dont take realigned reads for now - if (b.size()>1) { - rrc++; - //System.out.format("found realigned read %d\n", rrc); - return 0.0; - - } - */ - // case 1: read started before haplotype start position, we need to iterate only over remainder of read if (readStartPosition < haplotypeStartPosition) { - if (readEndPosition<= haplotypeEndPosition) - readStartIdx = (int)(haplotypeStartPosition - (readEndPosition - read.getReadBases().length)-1); - else - readStartIdx = (int)(haplotypeStartPosition- readStartPosition); + readStartIdx = (int)(haplotypeStartPosition- readStartPosition); initialIndexInHaplotype = 0; // implicit above - + if ((readStartPosition+readBases.length-1) < haplotypeStartPosition) + // when ignoring read CIGAR alignment, this read actually didn't overlap with haplotype: + // just ignore read + return INFINITE; } else { readStartIdx = 0; initialIndexInHaplotype = (int)(readStartPosition-haplotypeStartPosition); } + if (DEBUG) { + System.out.println("READ: "+read.getReadName()); + System.out.println(haplotypeStartPosition); + System.out.println(readStartPosition); + System.out.println(readStartIdx); + System.out.println(initialIndexInHaplotype); + } + + double pRead = 0; + if (doSimpleCalculationModel) { + // No Viterbi algorithm - assume no sequencing indel artifacts, + // so we can collapse computations and pr(read | haplotype) is just probability of observing overlap + // of read with haplotype. + int haplotypeIndex = initialIndexInHaplotype; + double c = 0.0;//deletionErrorProbabilities[1] +logOneMinusInsertionStartProbability; + // compute likelihood of portion of base to the left of the haplotype + for (int indR=readStartIdx-1; indR >= 0; indR--) { + byte readBase = readBases[indR]; + byte readQual = readQuals[indR]; + if (readQual <= 2) + continue; + double pBaseRead = getProbabilityOfReadBaseGivenXandI((byte)0, readBase, readQual, LEFT_ALIGN_INDEX, 0); + + // pBaseRead has -10*log10(Prob(base[i]|haplotype[i]) + pRead += pBaseRead; + + } + //System.out.format("\nSt: %d Pre-Likelihood:%f\n",readStartIdx, pRead); + + for (int indR=readStartIdx; indR < readBases.length; indR++) { + byte readBase = readBases[indR]; + byte readQual = readQuals[indR]; + + byte haplotypeBase; + if (haplotypeIndex < RIGHT_ALIGN_INDEX) + haplotypeBase = haplotype.getBasesAsBytes()[haplotypeIndex]; + else + haplotypeBase = (byte)0; // dummy + + double pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, haplotypeIndex, 0); + if (haplotypeBase != 0) + pBaseRead += c; + + // pBaseRead has -10*log10(Prob(base[i]|haplotype[i]) + if (readQual > 3) + pRead += pBaseRead; + haplotypeIndex++; + if (haplotypeIndex >= haplotype.getBasesAsBytes().length) + haplotypeIndex = RIGHT_ALIGN_INDEX; + //System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int)readQual, haplotypeIndex, pBaseRead, pRead); + } + //System.out.format("\nSt: %d Post-Likelihood:%f\n",readStartIdx, pRead); + + if (DEBUG) { + System.out.println(read.getReadName()); + System.out.print("Haplotype:"); + + for (int k=0; k = 0; indR--) { @@ -223,42 +310,44 @@ public class HaplotypeIndelErrorModel { // for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested // in log-likelihood of best state, this isn't really necessary. -/* - int[] bestIndexArray = new int[readBases.length+1]; - + if (DEBUG) { - int bestIndex = MathUtils.minElementIndex(forwardPathMetricArray); - bestIndexArray[readBases.length] = bestIndex; + int[] bestIndexArray = new int[readBases.length+1]; - for (int k=readBases.length-1; k>=0; k--) { - bestIndex = bestStateIndexArray[k][bestIndex]; - bestIndexArray[k] = bestIndex; + + int bestIndex = MathUtils.minElementIndex(forwardPathMetricArray); + bestIndexArray[readBases.length] = bestIndex; + + for (int k=readBases.length-1; k>=0; k--) { + bestIndex = bestStateIndexArray[k][bestIndex]; + bestIndexArray[k] = bestIndex; + } + + + System.out.println(read.getReadName()); + System.out.print("Haplotype:"); + + for (int k=0; k { @Output @@ -73,6 +73,14 @@ public class SimpleIndelGenotyperWalker extends RefWalker { int HAPLOTYPE_SIZE = 40; + @Argument(fullName="indelHeterozygozity",shortName="indhet",doc="Indel Heterozygosity (assumed 1/6500 from empirical human data)",required=false) + double indelHeterozygosity = (double)1.0/6500.0; + + @Argument(fullName="doSimpleCalculationModel",shortName="simple",doc="Use Simple Calculation Model for Pr(Reads | Haplotype)",required=false) + boolean doSimple = false; + + @Argument(fullName="enableDebugOutput",shortName="debugout",doc="Output debug data",required=false) + boolean DEBUG = false; @Override public boolean generateExtendedEvents() { return true; } @@ -83,7 +91,7 @@ public class SimpleIndelGenotyperWalker extends RefWalker { private HaplotypeIndelErrorModel model; - private static final int MAX_READ_LENGTH = 200; // TODO- make this dynamic + private static final int MAX_READ_LENGTH = 1000; // TODO- make this dynamic @@ -91,20 +99,12 @@ public class SimpleIndelGenotyperWalker extends RefWalker { @Override public void initialize() { model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability, - insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, MAX_READ_LENGTH); + insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, MAX_READ_LENGTH, doSimple, DEBUG); } - /* private void countIndels(ReadBackedExtendedEventPileup p) { - for ( ExtendedEventPileupElement pe : p.toExtendedIterable() ) { - if ( ! pe.isIndel() ) continue; - if ( pe.getEventLength() > MAX_LENGTH ) continue; - if ( pe.isInsertion() ) insCounts[pe.getEventLength()-1]++; - else delCounts[pe.getEventLength()-1]++; - } - } - */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) return 0; @@ -132,6 +132,36 @@ public class SimpleIndelGenotyperWalker extends RefWalker { ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads(); + // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. + // In general, we'll assume: even spread of indels throughout genome (not true, but simplifying assumption), + // and memoryless spread (i.e. probability that an indel lies in an interval A is independent of probability of + // another indel lying in interval B iff A and B don't overlap), then we can approximate inter-indel distances + // by an exponential distribution of mean 1/theta (theta = heterozygozity), and the number of indels on an interval + // of size L is Poisson-distributed with parameter lambda = theta*L. + + // Since typically, for small haplotype sizes and human heterozygozity, lambda will be <<1, we'll further approximate it + // by assuming that only one indel can happen in a particular interval, with Pr(indel present) = lambda*exp(-lambda), and + // pr(no indel) = 1-lambda*exp(-lambda) ~= exp(-lambda) for small lambda. + + // We also assume that a deletion is equally likely as an insertion (empirical observation, see e.g. Mills et al, Genome Research 2006) + // and we assume the following frequency spectrum for indel sizes Pr(event Length = L)= K*abs(L)^(-1.89)*10^(-0.015*abs(L)), + // taking positive L = insertions, negative L = deletions. K turns out to be about 1.5716 for probabilities to sum to one. + // so -10*log10(Pr event Length = L) =-10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L). + // Hence, Pr(observe event size = L in interval) ~ Pr(observe event L | event present) Pr (event present in interval) + // and -10*log10(above) = -10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L) - 10*log10(theta*L), and we ignore terms that would be + // added to ref hypothesis. + // Equation above is prior model. + int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length(); // assume only one alt allele for now + if (eventLength<0) + eventLength = - eventLength; + + double lambda = (double)HAPLOTYPE_SIZE * indelHeterozygosity; + double altPrior = HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength + + HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5); + + haplotypeLikehoodMatrix[0][1] = altPrior; + haplotypeLikehoodMatrix[1][1] = 2*altPrior; + int bestIndexI =0, bestIndexJ=0; for (int i=0; i < haplotypesInVC.size(); i++) { for (int j=i; j < haplotypesInVC.size(); j++){ @@ -140,9 +170,13 @@ public class SimpleIndelGenotyperWalker extends RefWalker { // compute Pr(r | hi, hj) = 1/2*(Pr(r|hi) + Pr(r|hj) double readLikelihood[] = new double[2]; - readLikelihood[0]= -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(i), read)/10.0; - if (i != j) - readLikelihood[1] = -model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read)/10.0; + double readLikeGivenH; + readLikeGivenH = model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(i), read); + readLikelihood[0]= -readLikeGivenH/10.0; + if (i != j){ + readLikeGivenH = model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read); + readLikelihood[1] = -readLikeGivenH/10.0; + } else readLikelihood[1] = readLikelihood[0]; @@ -150,6 +184,11 @@ public class SimpleIndelGenotyperWalker extends RefWalker { // sumlog10 computes 10^x[0]+10^x[1]+... double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2; haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair); + /*System.out.print(read.getReadName()+" "); + System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f %3.4f\n",i, j, read.getAlignmentStart(), + read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), + read.getCigarString(), probRGivenHPair, haplotypeLikehoodMatrix[i][j]); + */ } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index a92d9f242..cb37f12e6 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -111,7 +111,7 @@ public class Haplotype { byte[] refBases = ref.getBases(); - int numPrefBases = 5; // haplotypeSize/2; + int numPrefBases = haplotypeSize/2; int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart()); //int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event