First major update of indel genotyper:
a) Properly fix the strand bias computation for indels this time; the previous version was only a partial fix. b) Change the way we deal with bad bases at the edges of reads. Even when bases are soft-clipped in the CIGAR string, there may still be dangling bases with Q=2 that can throw off the QUAL computation at some sites. So we are stricter and also trim those bases off read edges even if they are not officially soft-clipped. c) First feeble-minded attempt at runtime optimization: don't compute the log and 10^base_qual every time. Rather, cache 10^(-k/10) and log(1-10^(-k/10)) for all k <= 60. This speeds up the code about 4x. d) Further optimization: don't compute log(10^x+10^y) directly, but rather use the softMax function recently put into ExactAFCalculationModel. e) Skip bad reads where all Q=2 (sic). f) Avoid log-to-linear and back-to-log conversions of genotype likelihoods - this was legacy code from back when the exact model worked in the linear domain. This improves precision overall. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4802 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e2d45ec2af
commit
ca7810f11d
|
|
@ -45,7 +45,6 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
||||||
private final double insertionEndProbability = 0.5;
|
private final double insertionEndProbability = 0.5;
|
||||||
private final double alphaDeletionProbability = 1e-3;
|
private final double alphaDeletionProbability = 1e-3;
|
||||||
private final int HAPLOTYPE_SIZE = 80;
|
private final int HAPLOTYPE_SIZE = 80;
|
||||||
private static final double MINUS_LOG_INFINITY = -300;
|
|
||||||
|
|
||||||
// todo - the following need to be exposed for command line argument control
|
// todo - the following need to be exposed for command line argument control
|
||||||
private final double indelHeterozygosity = 1.0/8000;
|
private final double indelHeterozygosity = 1.0/8000;
|
||||||
|
|
@ -89,11 +88,23 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
||||||
if (!vc.isIndel())
|
if (!vc.isIndel())
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
if (sitesVisited.contains(new Integer(vc.getStart())) &&
|
boolean visitedBefore = false;
|
||||||
contextType.equals(StratifiedAlignmentContext.StratifiedContextType.COMPLETE))
|
synchronized (this) {
|
||||||
return null;
|
if (sitesVisited.contains(new Integer(vc.getStart())) &&
|
||||||
|
contextType.equals(StratifiedAlignmentContext.StratifiedContextType.COMPLETE))
|
||||||
|
visitedBefore = true;
|
||||||
|
else {
|
||||||
|
sitesVisited.add(new Integer(vc.getStart()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sitesVisited.add(new Integer(vc.getStart()));
|
if (visitedBefore)
|
||||||
|
return null;
|
||||||
|
|
||||||
|
// protect against having an indel too close to the edge of a contig
|
||||||
|
if (vc.getStart() <= HAPLOTYPE_SIZE)
|
||||||
|
return null;
|
||||||
|
|
||||||
|
|
||||||
if ( !(priors instanceof DiploidIndelGenotypePriors) )
|
if ( !(priors instanceof DiploidIndelGenotypePriors) )
|
||||||
throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");
|
throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");
|
||||||
|
|
@ -105,11 +116,9 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
||||||
eventLength = - eventLength;
|
eventLength = - eventLength;
|
||||||
|
|
||||||
int currentHaplotypeSize = HAPLOTYPE_SIZE;
|
int currentHaplotypeSize = HAPLOTYPE_SIZE;
|
||||||
List<Haplotype> haplotypesInVC = new ArrayList<Haplotype>();
|
|
||||||
int minHaplotypeSize = Haplotype.LEFT_WINDOW_SIZE + eventLength + 2; // to be safe
|
|
||||||
|
|
||||||
// int numSamples = getNSamples(contexts);
|
// int numSamples = getNSamples(contexts);
|
||||||
haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
|
List<Haplotype> haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
|
||||||
// For each sample, get genotype likelihoods based on pileup
|
// For each sample, get genotype likelihoods based on pileup
|
||||||
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||||
// initialize the GenotypeLikelihoods
|
// initialize the GenotypeLikelihoods
|
||||||
|
|
@ -126,7 +135,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
||||||
//double[] priorLikelihoods = priors.getPriors();
|
//double[] priorLikelihoods = priors.getPriors();
|
||||||
|
|
||||||
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
|
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
|
||||||
AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
AlignmentContext context = sample.getValue().getContext(contextType);
|
||||||
|
|
||||||
ReadBackedPileup pileup = null;
|
ReadBackedPileup pileup = null;
|
||||||
if (context.hasExtendedEventPileup())
|
if (context.hasExtendedEventPileup())
|
||||||
|
|
@ -138,14 +147,8 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
||||||
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, vc, eventLength);
|
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, vc, eventLength);
|
||||||
|
|
||||||
|
|
||||||
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getPosteriorProbabilitesFromHaplotypeLikelihoods( haplotypeLikehoodMatrix);
|
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
|
||||||
|
|
||||||
// todo- cleaner solution for case where probability is of form (1,0,0) or similar
|
|
||||||
for (int k=0; k < 3; k++) {
|
|
||||||
genotypeLikelihoods[k] = Math.log10(genotypeLikelihoods[k]);
|
|
||||||
if (Double.isInfinite(genotypeLikelihoods[k]))
|
|
||||||
genotypeLikelihoods[k] = MINUS_LOG_INFINITY;
|
|
||||||
}
|
|
||||||
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
|
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
|
||||||
vc.getReference(),
|
vc.getReference(),
|
||||||
vc.getAlternateAllele(0),
|
vc.getAlternateAllele(0),
|
||||||
|
|
|
||||||
|
|
@ -151,7 +151,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
return softMaxPair(a,vec[2]);
|
return softMaxPair(a,vec[2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
double softMaxPair(double x, double y) {
|
static public double softMaxPair(double x, double y) {
|
||||||
if (Double.isInfinite(x))
|
if (Double.isInfinite(x))
|
||||||
return y;
|
return y;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
|
||||||
import net.sf.samtools.AlignmentBlock;
|
import net.sf.samtools.AlignmentBlock;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
@ -42,12 +43,13 @@ public class HaplotypeIndelErrorModel {
|
||||||
private final int maxReadDeletionLength; // maximum length of deletion on a read
|
private final int maxReadDeletionLength; // maximum length of deletion on a read
|
||||||
private final double noDeletionProbability; // alpha for geometric distribution for deletion length
|
private final double noDeletionProbability; // alpha for geometric distribution for deletion length
|
||||||
private final int haplotypeSize;
|
private final int haplotypeSize;
|
||||||
|
|
||||||
|
private final int BASE_QUAL_THRESHOLD = 6;
|
||||||
|
|
||||||
private final int PATH_METRIC_TABLE_LENGTH;
|
private final int PATH_METRIC_TABLE_LENGTH;
|
||||||
private final int RIGHT_ALIGN_INDEX;
|
private final int RIGHT_ALIGN_INDEX;
|
||||||
private final int LEFT_ALIGN_INDEX;
|
private final int LEFT_ALIGN_INDEX;
|
||||||
|
|
||||||
private static final double INFINITE = 10000000000000.0;
|
|
||||||
|
|
||||||
private double deletionErrorProbabilities[];
|
private double deletionErrorProbabilities[];
|
||||||
|
|
||||||
|
|
@ -64,6 +66,23 @@ public class HaplotypeIndelErrorModel {
|
||||||
|
|
||||||
private static final double QUAL_ONE_HALF = -10*Math.log10(0.5);
|
private static final double QUAL_ONE_HALF = -10*Math.log10(0.5);
|
||||||
|
|
||||||
|
private static final int MAX_CACHED_QUAL = 60;
|
||||||
|
|
||||||
|
private static final double baseMatchArray[];
|
||||||
|
private static final double baseMismatchArray[];
|
||||||
|
|
||||||
|
static {
|
||||||
|
baseMatchArray = new double[MAX_CACHED_QUAL+1];
|
||||||
|
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
|
||||||
|
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
|
||||||
|
double baseProb = QualityUtils.qualToProb(k);
|
||||||
|
|
||||||
|
|
||||||
|
baseMatchArray[k] = probToQual(baseProb);
|
||||||
|
baseMismatchArray[k] = (double)(k);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize,
|
public HaplotypeIndelErrorModel(int mrdl, double insStart, double insEnd, double alpha, int haplotypeSize,
|
||||||
boolean dosimple, boolean deb) {
|
boolean dosimple, boolean deb) {
|
||||||
this(mrdl, insStart, insEnd, alpha, haplotypeSize);
|
this(mrdl, insStart, insEnd, alpha, haplotypeSize);
|
||||||
|
|
@ -115,17 +134,38 @@ public class HaplotypeIndelErrorModel {
|
||||||
|
|
||||||
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read,
|
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read,
|
||||||
VariantContext vc, int eventLength) {
|
VariantContext vc, int eventLength) {
|
||||||
long numStartClippedBases = read.getAlignmentStart() - read.getUnclippedStart();
|
|
||||||
long numEndClippedBases = read.getUnclippedEnd() - read.getAlignmentEnd();
|
|
||||||
final long readStartPosition = read.getAlignmentStart();
|
|
||||||
final long haplotypeStartPosition = haplotype.getStartPosition();
|
|
||||||
//final long readEndPosition = read.getUnclippedEnd();
|
|
||||||
//final long haplotypeEndPosition = haplotype.getStartPosition() + haplotypeSize-1;
|
|
||||||
|
|
||||||
byte[] readBases = Arrays.copyOfRange(read.getReadBases(),(int)numStartClippedBases,
|
long numStartClippedBases = 0;
|
||||||
|
long numEndClippedBases = 0;
|
||||||
|
|
||||||
|
|
||||||
|
byte[] unclippedReadQuals = read.getBaseQualities();
|
||||||
|
byte[] unclippedReadBases = read.getReadBases();
|
||||||
|
|
||||||
|
|
||||||
|
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||||
|
// and may leave a string of Q2 bases still hanging off the reads.
|
||||||
|
for (int i=0; i < read.getReadLength(); i++) {
|
||||||
|
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||||
|
numStartClippedBases++;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
|
||||||
|
}
|
||||||
|
for (int i=read.getReadLength()-1; i >= 0; i-- ){
|
||||||
|
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||||
|
numEndClippedBases++;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
//System.out.format("numstart: %d numend: %d\n", numStartClippedBases, numEndClippedBases);
|
||||||
|
if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) {
|
||||||
|
return 0;///Double.POSITIVE_INFINITY;
|
||||||
|
}
|
||||||
|
byte[] readBases = Arrays.copyOfRange(unclippedReadBases,(int)numStartClippedBases,
|
||||||
(int)(read.getReadBases().length-numEndClippedBases));
|
(int)(read.getReadBases().length-numEndClippedBases));
|
||||||
|
|
||||||
byte[] readQuals = Arrays.copyOfRange(read.getBaseQualities(),(int)numStartClippedBases,
|
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,(int)numStartClippedBases,
|
||||||
(int)(read.getReadBases().length-numEndClippedBases));
|
(int)(read.getReadBases().length-numEndClippedBases));
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -233,8 +273,7 @@ public class HaplotypeIndelErrorModel {
|
||||||
byte readBase = readBases[indR];
|
byte readBase = readBases[indR];
|
||||||
byte readQual = readQuals[indR];
|
byte readQual = readQuals[indR];
|
||||||
|
|
||||||
|
for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
|
||||||
for (int indX=LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
|
|
||||||
|
|
||||||
|
|
||||||
byte haplotypeBase;
|
byte haplotypeBase;
|
||||||
|
|
@ -251,17 +290,8 @@ public class HaplotypeIndelErrorModel {
|
||||||
|
|
||||||
// for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested
|
// for debugging only: compute backtracking to find optimal route through trellis. Since I'm only interested
|
||||||
// in log-likelihood of best state, this isn't really necessary.
|
// in log-likelihood of best state, this isn't really necessary.
|
||||||
int[] bestIndexArray = new int[readLength];
|
|
||||||
double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]);
|
double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]);
|
||||||
|
|
||||||
// start from last position of read, go backwards to find optimal alignment
|
|
||||||
int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
|
|
||||||
bestIndexArray[readLength-1] = bestIndex;
|
|
||||||
|
|
||||||
for (int k=readLength-2; k>=0; k--) {
|
|
||||||
bestIndex = bestStateIndexArray[k][bestIndex];
|
|
||||||
bestIndexArray[k] = bestIndex;
|
|
||||||
}
|
|
||||||
if (DEBUG) {
|
if (DEBUG) {
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -280,6 +310,22 @@ public class HaplotypeIndelErrorModel {
|
||||||
}
|
}
|
||||||
System.out.println();
|
System.out.println();
|
||||||
|
|
||||||
|
System.out.print("Read quals: ");
|
||||||
|
for (int k=0; k <readQuals.length; k++) {
|
||||||
|
System.out.format("%d ", (int)readQuals[k]);
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
// start from last position of read, go backwards to find optimal alignment
|
||||||
|
int[] bestIndexArray = new int[readLength];
|
||||||
|
int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
|
||||||
|
bestIndexArray[readLength-1] = bestIndex;
|
||||||
|
|
||||||
|
for (int k=readLength-2; k>=0; k--) {
|
||||||
|
bestIndex = bestStateIndexArray[k][bestIndex];
|
||||||
|
bestIndexArray[k] = bestIndex;
|
||||||
|
}
|
||||||
|
|
||||||
System.out.print("Alignment: ");
|
System.out.print("Alignment: ");
|
||||||
for (int k=0; k <readBases.length; k++) {
|
for (int k=0; k <readBases.length; k++) {
|
||||||
System.out.format("%d ", bestIndexArray[k]);
|
System.out.format("%d ", bestIndexArray[k]);
|
||||||
|
|
@ -297,7 +343,7 @@ public class HaplotypeIndelErrorModel {
|
||||||
|
|
||||||
double bmetric;
|
double bmetric;
|
||||||
|
|
||||||
double bestMetric = INFINITE;
|
double bestMetric = Double.POSITIVE_INFINITY;
|
||||||
int bestMetricIndex = -1;
|
int bestMetricIndex = -1;
|
||||||
|
|
||||||
// compute metric for match/mismatch
|
// compute metric for match/mismatch
|
||||||
|
|
@ -307,11 +353,11 @@ public class HaplotypeIndelErrorModel {
|
||||||
if (readQual < 1)
|
if (readQual < 1)
|
||||||
readQual = 1;
|
readQual = 1;
|
||||||
|
|
||||||
double baseProb = QualityUtils.qualToProb(readQual);
|
if (readQual > MAX_CACHED_QUAL)
|
||||||
|
readQual = MAX_CACHED_QUAL;
|
||||||
|
|
||||||
|
double pBaseMatch = baseMatchArray[(int)readQual];
|
||||||
double pBaseMatch = probToQual(baseProb);
|
double pBaseMismatch = baseMismatchArray[(int)readQual];
|
||||||
double pBaseMismatch = (double)(readQual);
|
|
||||||
|
|
||||||
if (haplotypeBase == readBase)
|
if (haplotypeBase == readBase)
|
||||||
pBaseRead = pBaseMatch;
|
pBaseRead = pBaseMatch;
|
||||||
|
|
@ -406,8 +452,11 @@ public class HaplotypeIndelErrorModel {
|
||||||
readLikelihood[0] = -readLikelihoods[readIdx][i]/10;
|
readLikelihood[0] = -readLikelihoods[readIdx][i]/10;
|
||||||
readLikelihood[1] = -readLikelihoods[readIdx][j]/10;
|
readLikelihood[1] = -readLikelihoods[readIdx][j]/10;
|
||||||
|
|
||||||
double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2;
|
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||||
haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
|
// First term is approximated by Jacobian log with table lookup.
|
||||||
|
// Second term is a constant added to both likelihoods so will be ignored
|
||||||
|
haplotypeLikehoodMatrix[i][j] += ExactAFCalculationModel.softMaxPair(readLikelihood[0],
|
||||||
|
readLikelihood[1]);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -419,18 +468,25 @@ public class HaplotypeIndelErrorModel {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static double[] getPosteriorProbabilitesFromHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
|
public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
|
||||||
int hSize = haplotypeLikehoodMatrix.length;
|
int hSize = haplotypeLikehoodMatrix.length;
|
||||||
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
|
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
|
||||||
|
|
||||||
int k=0;
|
int k=0;
|
||||||
|
double maxElement = Double.NEGATIVE_INFINITY;
|
||||||
for (int i=0; i < hSize; i++) {
|
for (int i=0; i < hSize; i++) {
|
||||||
for (int j=i; j < hSize; j++){
|
for (int j=i; j < hSize; j++){
|
||||||
genotypeLikelihoods[k++] = -haplotypeLikehoodMatrix[i][j]/10;
|
genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j];
|
||||||
|
if (haplotypeLikehoodMatrix[i][j] > maxElement)
|
||||||
|
maxElement = haplotypeLikehoodMatrix[i][j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// normalize likelihoods and pass to linear domain.
|
|
||||||
return MathUtils.normalizeFromLog10(genotypeLikelihoods);
|
// renormalize
|
||||||
|
for (int i=0; i < genotypeLikelihoods.length; i++)
|
||||||
|
genotypeLikelihoods[i] -= maxElement;
|
||||||
|
|
||||||
|
return genotypeLikelihoods;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue