First major update of indel genotyper:

a) Really fix this time strand bias computation for indels, previous version was a partial fix only.
b) Change way in which we deal with bad bases at the edge of reads. Even if a base is soft clipped in CIGAR string, there may still be dangling bases with Q=2 that may throw off QUAL computation in some sites. So, we're stricter and we also trim off those bases off read edges even if they are not soft-clipped officially.
c) First feeble-minded attempt at runtime optimization - don't compute log and 10^base_qual every time. Rather, cache 10^-k/10 and log(1-10^-k/10) for all k <=60. This speeds up code about 4x.
d) Further optimization: don't compute log(10^x+10^y) but rather use softMax function recently put into ExactAFCalculationModel.
e) Skip bad reads where all Q=2 (sic)
f) Avoid log to lin and back to log conversions of genotype likelihoods - this was legacy code from back when exact model did stuff in linear domain. This improves precision overall.




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4802 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-12-07 18:35:22 +00:00
parent e2d45ec2af
commit ca7810f11d
3 changed files with 107 additions and 48 deletions

View File

@ -45,7 +45,6 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
private final double insertionEndProbability = 0.5;
private final double alphaDeletionProbability = 1e-3;
private final int HAPLOTYPE_SIZE = 80;
private static final double MINUS_LOG_INFINITY = -300;
// todo - the following need to be exposed for command line argument control
private final double indelHeterozygosity = 1.0/8000;
@ -89,11 +88,23 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
if (!vc.isIndel())
return null;
if (sitesVisited.contains(new Integer(vc.getStart())) &&
contextType.equals(StratifiedAlignmentContext.StratifiedContextType.COMPLETE))
return null;
boolean visitedBefore = false;
synchronized (this) {
if (sitesVisited.contains(new Integer(vc.getStart())) &&
contextType.equals(StratifiedAlignmentContext.StratifiedContextType.COMPLETE))
visitedBefore = true;
else {
sitesVisited.add(new Integer(vc.getStart()));
}
}
sitesVisited.add(new Integer(vc.getStart()));
if (visitedBefore)
return null;
// protect against having an indel too close to the edge of a contig
if (vc.getStart() <= HAPLOTYPE_SIZE)
return null;
if ( !(priors instanceof DiploidIndelGenotypePriors) )
throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");
@ -105,11 +116,9 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
eventLength = - eventLength;
int currentHaplotypeSize = HAPLOTYPE_SIZE;
List<Haplotype> haplotypesInVC = new ArrayList<Haplotype>();
int minHaplotypeSize = Haplotype.LEFT_WINDOW_SIZE + eventLength + 2; // to be safe
// int numSamples = getNSamples(contexts);
haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
List<Haplotype> haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
// For each sample, get genotype likelihoods based on pileup
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
// initialize the GenotypeLikelihoods
@ -126,7 +135,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
//double[] priorLikelihoods = priors.getPriors();
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
AlignmentContext context = sample.getValue().getContext(contextType);
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
@ -138,14 +147,8 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, vc, eventLength);
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getPosteriorProbabilitesFromHaplotypeLikelihoods( haplotypeLikehoodMatrix);
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
// todo- cleaner solution for case where probability is of form (1,0,0) or similar
for (int k=0; k < 3; k++) {
genotypeLikelihoods[k] = Math.log10(genotypeLikelihoods[k]);
if (Double.isInfinite(genotypeLikelihoods[k]))
genotypeLikelihoods[k] = MINUS_LOG_INFINITY;
}
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
vc.getReference(),
vc.getAlternateAllele(0),

View File

@ -151,7 +151,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return softMaxPair(a,vec[2]);
}
double softMaxPair(double x, double y) {
static public double softMaxPair(double x, double y) {
if (Double.isInfinite(x))
return y;

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.AlignmentBlock;
import net.sf.samtools.SAMRecord;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
@ -42,12 +43,13 @@ public class HaplotypeIndelErrorModel {
private final int maxReadDeletionLength; // maximum length of deletion on a read
private final double noDeletionProbability; // alpha for geometric distribution for deletion length
private final int haplotypeSize;
private final int BASE_QUAL_THRESHOLD = 6;
private final int PATH_METRIC_TABLE_LENGTH;
private final int RIGHT_ALIGN_INDEX;
private final int LEFT_ALIGN_INDEX;
private static final double INFINITE = 10000000000000.0;
private double deletionErrorProbabilities[];
@ -64,6 +66,23 @@ public class HaplotypeIndelErrorModel {
private static final double QUAL_ONE_HALF = -10*Math.log10(0.5);
private static final int MAX_CACHED_QUAL = 60;
private static final double baseMatchArray[];
private static final double baseMismatchArray[];
static {
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
double baseProb = QualityUtils.qualToProb(k);
baseMatchArray[k] = probToQual(baseProb);
baseMismatchArray[k] = (double)(k);
}
}
public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize,
boolean dosimple, boolean deb) {
this(mrdl, insStart, insEnd, alpha, haplotypeSize);
@ -115,17 +134,38 @@ public class HaplotypeIndelErrorModel {
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read,
VariantContext vc, int eventLength) {
long numStartClippedBases = read.getAlignmentStart() - read.getUnclippedStart();
long numEndClippedBases = read.getUnclippedEnd() - read.getAlignmentEnd();
final long readStartPosition = read.getAlignmentStart();
final long haplotypeStartPosition = haplotype.getStartPosition();
//final long readEndPosition = read.getUnclippedEnd();
//final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1;
byte[] readBases = Arrays.copyOfRange(read.getReadBases(),(int)numStartClippedBases,
long numStartClippedBases = 0;
long numEndClippedBases = 0;
byte[] unclippedReadQuals = read.getBaseQualities();
byte[] unclippedReadBases = read.getReadBases();
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
for (int i=0; i < read.getReadLength(); i++) {
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
numStartClippedBases++;
else
break;
}
for (int i=read.getReadLength()-1; i >= 0; i-- ){
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
numEndClippedBases++;
else
break;
}
//System.out.format("numstart: %d numend: %d\n", numStartClippedBases, numEndClippedBases);
if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) {
return 0;///Double.POSITIVE_INFINITY;
}
byte[] readBases = Arrays.copyOfRange(unclippedReadBases,(int)numStartClippedBases,
(int)(read.getReadBases().length-numEndClippedBases));
byte[] readQuals = Arrays.copyOfRange(read.getBaseQualities(),(int)numStartClippedBases,
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,(int)numStartClippedBases,
(int)(read.getReadBases().length-numEndClippedBases));
@ -233,8 +273,7 @@ public class HaplotypeIndelErrorModel {
byte readBase = readBases[indR];
byte readQual = readQuals[indR];
for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
byte haplotypeBase;
@ -251,17 +290,8 @@ public class HaplotypeIndelErrorModel {
// for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested
// in log-likelihood of best state, this isn't really necessary.
int[] bestIndexArray = new int[readLength];
double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]);
// start from last position of read, go backwards to find optimal alignment
int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
bestIndexArray[readLength-1] = bestIndex;
for (int k=readLength-2; k>=0; k--) {
bestIndex = bestStateIndexArray[k][bestIndex];
bestIndexArray[k] = bestIndex;
}
if (DEBUG) {
@ -280,6 +310,22 @@ public class HaplotypeIndelErrorModel {
}
System.out.println();
System.out.print("Read quals: ");
for (int k=0; k <readQuals.length; k++) {
System.out.format("%d ", (int)readQuals[k]);
}
System.out.println();
// start from last position of read, go backwards to find optimal alignment
int[] bestIndexArray = new int[readLength];
int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
bestIndexArray[readLength-1] = bestIndex;
for (int k=readLength-2; k>=0; k--) {
bestIndex = bestStateIndexArray[k][bestIndex];
bestIndexArray[k] = bestIndex;
}
System.out.print("Alignment: ");
for (int k=0; k <readBases.length; k++) {
System.out.format("%d ", bestIndexArray[k]);
@ -297,7 +343,7 @@ public class HaplotypeIndelErrorModel {
double bmetric;
double bestMetric = INFINITE;
double bestMetric = Double.POSITIVE_INFINITY;
int bestMetricIndex = -1;
// compute metric for match/mismatch
@ -307,11 +353,11 @@ public class HaplotypeIndelErrorModel {
if (readQual < 1)
readQual = 1;
double baseProb = QualityUtils.qualToProb(readQual);
if (readQual > MAX_CACHED_QUAL)
readQual = MAX_CACHED_QUAL;
double pBaseMatch = probToQual(baseProb);
double pBaseMismatch = (double)(readQual);
double pBaseMatch = baseMatchArray[(int)readQual];
double pBaseMismatch = baseMismatchArray[(int)readQual];
if (haplotypeBase == readBase)
pBaseRead = pBaseMatch;
@ -406,8 +452,11 @@ public class HaplotypeIndelErrorModel {
readLikelihood[0] = -readLikelihoods[readIdx][i]/10;
readLikelihood[1] = -readLikelihoods[readIdx][j]/10;
double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2;
haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
// Second term is a constant added to both likelihoods so will be ignored
haplotypeLikehoodMatrix[i][j] += ExactAFCalculationModel.softMaxPair(readLikelihood[0],
readLikelihood[1]);
}
@ -419,18 +468,25 @@ public class HaplotypeIndelErrorModel {
}
public static double[] getPosteriorProbabilitesFromHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
int hSize = haplotypeLikehoodMatrix.length;
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
int k=0;
double maxElement = Double.NEGATIVE_INFINITY;
for (int i=0; i < hSize; i++) {
for (int j=i; j < hSize; j++){
genotypeLikelihoods[k++] = -haplotypeLikehoodMatrix[i][j]/10;
genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j];
if (haplotypeLikehoodMatrix[i][j] > maxElement)
maxElement = haplotypeLikehoodMatrix[i][j];
}
}
// normalize likelihoods and pass to linear domain.
return MathUtils.normalizeFromLog10(genotypeLikelihoods);
// renormalize
for (int i=0; i < genotypeLikelihoods.length; i++)
genotypeLikelihoods[i] -= maxElement;
return genotypeLikelihoods;
}