First major update of indel genotyper:
a) Really fix this time strand bias computation for indels, previous version was a partial fix only. b) Change way in which we deal with bad bases at the edge of reads. Even if a base is soft clipped in CIGAR string, there may still be dangling bases with Q=2 that may throw off QUAL computation in some sites. So, we're stricter and we also trim those bases off read edges even if they are not soft-clipped officially. c) First feeble-minded attempt at runtime optimization - don't compute log and 10^base_qual every time. Rather, cache 10^-k/10 and log(1-10^-k/10) for all k <=60. This speeds up code about 4x. d) Further optimization: don't compute log(10^x+10^y) but rather use softMax function recently put into ExactAFCalculationModel. e) Skip bad reads where all Q=2 (sic) f) Avoid log to lin and back to log conversions of genotype likelihoods - this was legacy code from back when exact model did stuff in linear domain. This improves precision overall. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4802 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e2d45ec2af
commit
ca7810f11d
|
|
@ -45,7 +45,6 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
private final double insertionEndProbability = 0.5;
|
||||
private final double alphaDeletionProbability = 1e-3;
|
||||
private final int HAPLOTYPE_SIZE = 80;
|
||||
private static final double MINUS_LOG_INFINITY = -300;
|
||||
|
||||
// todo - the following need to be exposed for command line argument control
|
||||
private final double indelHeterozygosity = 1.0/8000;
|
||||
|
|
@ -89,11 +88,23 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
if (!vc.isIndel())
|
||||
return null;
|
||||
|
||||
if (sitesVisited.contains(new Integer(vc.getStart())) &&
|
||||
contextType.equals(StratifiedAlignmentContext.StratifiedContextType.COMPLETE))
|
||||
return null;
|
||||
boolean visitedBefore = false;
|
||||
synchronized (this) {
|
||||
if (sitesVisited.contains(new Integer(vc.getStart())) &&
|
||||
contextType.equals(StratifiedAlignmentContext.StratifiedContextType.COMPLETE))
|
||||
visitedBefore = true;
|
||||
else {
|
||||
sitesVisited.add(new Integer(vc.getStart()));
|
||||
}
|
||||
}
|
||||
|
||||
sitesVisited.add(new Integer(vc.getStart()));
|
||||
if (visitedBefore)
|
||||
return null;
|
||||
|
||||
// protect against having an indel too close to the edge of a contig
|
||||
if (vc.getStart() <= HAPLOTYPE_SIZE)
|
||||
return null;
|
||||
|
||||
|
||||
if ( !(priors instanceof DiploidIndelGenotypePriors) )
|
||||
throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");
|
||||
|
|
@ -105,11 +116,9 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
eventLength = - eventLength;
|
||||
|
||||
int currentHaplotypeSize = HAPLOTYPE_SIZE;
|
||||
List<Haplotype> haplotypesInVC = new ArrayList<Haplotype>();
|
||||
int minHaplotypeSize = Haplotype.LEFT_WINDOW_SIZE + eventLength + 2; // to be safe
|
||||
|
||||
// int numSamples = getNSamples(contexts);
|
||||
haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
|
||||
List<Haplotype> haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
|
||||
// For each sample, get genotype likelihoods based on pileup
|
||||
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||
// initialize the GenotypeLikelihoods
|
||||
|
|
@ -126,7 +135,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
//double[] priorLikelihoods = priors.getPriors();
|
||||
|
||||
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
|
||||
AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
AlignmentContext context = sample.getValue().getContext(contextType);
|
||||
|
||||
ReadBackedPileup pileup = null;
|
||||
if (context.hasExtendedEventPileup())
|
||||
|
|
@ -138,14 +147,8 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, vc, eventLength);
|
||||
|
||||
|
||||
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getPosteriorProbabilitesFromHaplotypeLikelihoods( haplotypeLikehoodMatrix);
|
||||
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
|
||||
|
||||
// todo- cleaner solution for case where probability is of form (1,0,0) or similar
|
||||
for (int k=0; k < 3; k++) {
|
||||
genotypeLikelihoods[k] = Math.log10(genotypeLikelihoods[k]);
|
||||
if (Double.isInfinite(genotypeLikelihoods[k]))
|
||||
genotypeLikelihoods[k] = MINUS_LOG_INFINITY;
|
||||
}
|
||||
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
|
||||
vc.getReference(),
|
||||
vc.getAlternateAllele(0),
|
||||
|
|
|
|||
|
|
@ -151,7 +151,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
return softMaxPair(a,vec[2]);
|
||||
}
|
||||
|
||||
double softMaxPair(double x, double y) {
|
||||
static public double softMaxPair(double x, double y) {
|
||||
if (Double.isInfinite(x))
|
||||
return y;
|
||||
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
|
|||
import net.sf.samtools.AlignmentBlock;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -42,12 +43,13 @@ public class HaplotypeIndelErrorModel {
|
|||
private final int maxReadDeletionLength; // maximum length of deletion on a read
|
||||
private final double noDeletionProbability; // alpha for geometric distribution for deletion length
|
||||
private final int haplotypeSize;
|
||||
|
||||
private final int BASE_QUAL_THRESHOLD = 6;
|
||||
|
||||
private final int PATH_METRIC_TABLE_LENGTH;
|
||||
private final int RIGHT_ALIGN_INDEX;
|
||||
private final int LEFT_ALIGN_INDEX;
|
||||
|
||||
private static final double INFINITE = 10000000000000.0;
|
||||
|
||||
private double deletionErrorProbabilities[];
|
||||
|
||||
|
|
@ -64,6 +66,23 @@ public class HaplotypeIndelErrorModel {
|
|||
|
||||
private static final double QUAL_ONE_HALF = -10*Math.log10(0.5);
|
||||
|
||||
private static final int MAX_CACHED_QUAL = 60;
|
||||
|
||||
private static final double baseMatchArray[];
|
||||
private static final double baseMismatchArray[];
|
||||
|
||||
static {
|
||||
baseMatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
|
||||
double baseProb = QualityUtils.qualToProb(k);
|
||||
|
||||
|
||||
baseMatchArray[k] = probToQual(baseProb);
|
||||
baseMismatchArray[k] = (double)(k);
|
||||
}
|
||||
}
|
||||
|
||||
public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize,
|
||||
boolean dosimple, boolean deb) {
|
||||
this(mrdl, insStart, insEnd, alpha, haplotypeSize);
|
||||
|
|
@ -115,17 +134,38 @@ public class HaplotypeIndelErrorModel {
|
|||
|
||||
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read,
|
||||
VariantContext vc, int eventLength) {
|
||||
long numStartClippedBases = read.getAlignmentStart() - read.getUnclippedStart();
|
||||
long numEndClippedBases = read.getUnclippedEnd() - read.getAlignmentEnd();
|
||||
final long readStartPosition = read.getAlignmentStart();
|
||||
final long haplotypeStartPosition = haplotype.getStartPosition();
|
||||
//final long readEndPosition = read.getUnclippedEnd();
|
||||
//final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1;
|
||||
|
||||
byte[] readBases = Arrays.copyOfRange(read.getReadBases(),(int)numStartClippedBases,
|
||||
long numStartClippedBases = 0;
|
||||
long numEndClippedBases = 0;
|
||||
|
||||
|
||||
byte[] unclippedReadQuals = read.getBaseQualities();
|
||||
byte[] unclippedReadBases = read.getReadBases();
|
||||
|
||||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
for (int i=0; i < read.getReadLength(); i++) {
|
||||
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||
numStartClippedBases++;
|
||||
else
|
||||
break;
|
||||
|
||||
}
|
||||
for (int i=read.getReadLength()-1; i >= 0; i-- ){
|
||||
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||
numEndClippedBases++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
//System.out.format("numstart: %d numend: %d\n", numStartClippedBases, numEndClippedBases);
|
||||
if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) {
|
||||
return 0;///Double.POSITIVE_INFINITY;
|
||||
}
|
||||
byte[] readBases = Arrays.copyOfRange(unclippedReadBases,(int)numStartClippedBases,
|
||||
(int)(read.getReadBases().length-numEndClippedBases));
|
||||
|
||||
byte[] readQuals = Arrays.copyOfRange(read.getBaseQualities(),(int)numStartClippedBases,
|
||||
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,(int)numStartClippedBases,
|
||||
(int)(read.getReadBases().length-numEndClippedBases));
|
||||
|
||||
|
||||
|
|
@ -233,8 +273,7 @@ public class HaplotypeIndelErrorModel {
|
|||
byte readBase = readBases[indR];
|
||||
byte readQual = readQuals[indR];
|
||||
|
||||
|
||||
for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
|
||||
for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
|
||||
|
||||
|
||||
byte haplotypeBase;
|
||||
|
|
@ -251,17 +290,8 @@ public class HaplotypeIndelErrorModel {
|
|||
|
||||
// for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested
|
||||
// in log-likelihood of best state, this isn't really necessary.
|
||||
int[] bestIndexArray = new int[readLength];
|
||||
double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]);
|
||||
|
||||
// start from last position of read, go backwards to find optimal alignment
|
||||
int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
|
||||
bestIndexArray[readLength-1] = bestIndex;
|
||||
|
||||
for (int k=readLength-2; k>=0; k--) {
|
||||
bestIndex = bestStateIndexArray[k][bestIndex];
|
||||
bestIndexArray[k] = bestIndex;
|
||||
}
|
||||
if (DEBUG) {
|
||||
|
||||
|
||||
|
|
@ -280,6 +310,22 @@ public class HaplotypeIndelErrorModel {
|
|||
}
|
||||
System.out.println();
|
||||
|
||||
System.out.print("Read quals: ");
|
||||
for (int k=0; k <readQuals.length; k++) {
|
||||
System.out.format("%d ", (int)readQuals[k]);
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
// start from last position of read, go backwards to find optimal alignment
|
||||
int[] bestIndexArray = new int[readLength];
|
||||
int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
|
||||
bestIndexArray[readLength-1] = bestIndex;
|
||||
|
||||
for (int k=readLength-2; k>=0; k--) {
|
||||
bestIndex = bestStateIndexArray[k][bestIndex];
|
||||
bestIndexArray[k] = bestIndex;
|
||||
}
|
||||
|
||||
System.out.print("Alignment: ");
|
||||
for (int k=0; k <readBases.length; k++) {
|
||||
System.out.format("%d ", bestIndexArray[k]);
|
||||
|
|
@ -297,7 +343,7 @@ public class HaplotypeIndelErrorModel {
|
|||
|
||||
double bmetric;
|
||||
|
||||
double bestMetric = INFINITE;
|
||||
double bestMetric = Double.POSITIVE_INFINITY;
|
||||
int bestMetricIndex = -1;
|
||||
|
||||
// compute metric for match/mismatch
|
||||
|
|
@ -307,11 +353,11 @@ public class HaplotypeIndelErrorModel {
|
|||
if (readQual < 1)
|
||||
readQual = 1;
|
||||
|
||||
double baseProb = QualityUtils.qualToProb(readQual);
|
||||
if (readQual > MAX_CACHED_QUAL)
|
||||
readQual = MAX_CACHED_QUAL;
|
||||
|
||||
|
||||
double pBaseMatch = probToQual(baseProb);
|
||||
double pBaseMismatch = (double)(readQual);
|
||||
double pBaseMatch = baseMatchArray[(int)readQual];
|
||||
double pBaseMismatch = baseMismatchArray[(int)readQual];
|
||||
|
||||
if (haplotypeBase == readBase)
|
||||
pBaseRead = pBaseMatch;
|
||||
|
|
@ -406,8 +452,11 @@ public class HaplotypeIndelErrorModel {
|
|||
readLikelihood[0] = -readLikelihoods[readIdx][i]/10;
|
||||
readLikelihood[1] = -readLikelihoods[readIdx][j]/10;
|
||||
|
||||
double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2;
|
||||
haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
// Second term is a constant added to both likelihoods so will be ignored
|
||||
haplotypeLikehoodMatrix[i][j] += ExactAFCalculationModel.softMaxPair(readLikelihood[0],
|
||||
readLikelihood[1]);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -419,18 +468,25 @@ public class HaplotypeIndelErrorModel {
|
|||
|
||||
}
|
||||
|
||||
public static double[] getPosteriorProbabilitesFromHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
|
||||
public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
|
||||
int hSize = haplotypeLikehoodMatrix.length;
|
||||
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
|
||||
|
||||
int k=0;
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for (int i=0; i < hSize; i++) {
|
||||
for (int j=i; j < hSize; j++){
|
||||
genotypeLikelihoods[k++] = -haplotypeLikehoodMatrix[i][j]/10;
|
||||
genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j];
|
||||
if (haplotypeLikehoodMatrix[i][j] > maxElement)
|
||||
maxElement = haplotypeLikehoodMatrix[i][j];
|
||||
}
|
||||
}
|
||||
// normalize likelihoods and pass to linear domain.
|
||||
return MathUtils.normalizeFromLog10(genotypeLikelihoods);
|
||||
|
||||
// renormalize
|
||||
for (int i=0; i < genotypeLikelihoods.length; i++)
|
||||
genotypeLikelihoods[i] -= maxElement;
|
||||
|
||||
return genotypeLikelihoods;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue