Refactoring of ReferenceConfidenceModel likelihood calculation in non-variant sites
Changed a division by -10.0 to a multiplication by -0.1 in QualityUtils (typically multiplication is faster than division). Addresses performance issue #1081.
This commit is contained in:
parent
d4423d8303
commit
8f6daf70db
|
|
@ -775,7 +775,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
||||||
final String sampleName = sample.getKey();
|
final String sampleName = sample.getKey();
|
||||||
// The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not.
|
// The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not.
|
||||||
final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy;
|
final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy;
|
||||||
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sampleName,activeRegionDetectionHackishSamplePloidy,genotypingModel,sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods;
|
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(activeRegionDetectionHackishSamplePloidy,sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods;
|
||||||
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
|
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -55,13 +55,14 @@ import htsjdk.samtools.*;
|
||||||
import htsjdk.variant.variantcontext.*;
|
import htsjdk.variant.variantcontext.*;
|
||||||
import htsjdk.variant.vcf.VCFHeaderLine;
|
import htsjdk.variant.vcf.VCFHeaderLine;
|
||||||
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
|
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
|
||||||
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
|
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingModel;
|
||||||
import org.broadinstitute.gatk.tools.walkers.genotyper.*;
|
import org.broadinstitute.gatk.tools.walkers.genotyper.PloidyModel;
|
||||||
import org.broadinstitute.gatk.utils.GenomeLoc;
|
import org.broadinstitute.gatk.utils.GenomeLoc;
|
||||||
import org.broadinstitute.gatk.utils.GenomeLocParser;
|
import org.broadinstitute.gatk.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.gatk.utils.MathUtils;
|
import org.broadinstitute.gatk.utils.MathUtils;
|
||||||
import org.broadinstitute.gatk.utils.QualityUtils;
|
import org.broadinstitute.gatk.utils.QualityUtils;
|
||||||
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
|
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
|
||||||
|
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.gatk.utils.genotyper.*;
|
import org.broadinstitute.gatk.utils.genotyper.*;
|
||||||
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
|
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
|
||||||
import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
|
import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
|
||||||
|
|
@ -205,9 +206,10 @@ public class ReferenceConfidenceModel {
|
||||||
results.add(overlappingSite);
|
results.add(overlappingSite);
|
||||||
} else {
|
} else {
|
||||||
// otherwise emit a reference confidence variant context
|
// otherwise emit a reference confidence variant context
|
||||||
|
// Assume infinite population on a single sample.
|
||||||
final int refOffset = offset + globalRefOffset;
|
final int refOffset = offset + globalRefOffset;
|
||||||
final byte refBase = ref[refOffset];
|
final byte refBase = ref[refOffset];
|
||||||
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(sampleName,ploidy,model,pileup, refBase, (byte)6, null);
|
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, (byte)6, null);
|
||||||
homRefCalc.capByHomRefLikelihood();
|
homRefCalc.capByHomRefLikelihood();
|
||||||
|
|
||||||
final Allele refAllele = Allele.create(refBase, true);
|
final Allele refAllele = Allele.create(refBase, true);
|
||||||
|
|
@ -304,70 +306,64 @@ public class ReferenceConfidenceModel {
|
||||||
/**
|
/**
|
||||||
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
|
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
|
||||||
*
|
*
|
||||||
* @param sampleName target sample name.
|
|
||||||
* @param ploidy target sample ploidy.
|
* @param ploidy target sample ploidy.
|
||||||
* @param genotypingModel model to calculate likelihoods and genotypes.
|
|
||||||
* @param pileup the read backed pileup containing the data we want to evaluate
|
* @param pileup the read backed pileup containing the data we want to evaluate
|
||||||
* @param refBase the reference base at this pileup position
|
* @param refBase the reference base at this pileup position
|
||||||
* @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation
|
* @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation
|
||||||
* @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
|
* @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
|
||||||
* @return a RefVsAnyResult genotype call.
|
* @return a RefVsAnyResult genotype call.
|
||||||
*/
|
*/
|
||||||
public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final String sampleName, final int ploidy,
|
public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final int ploidy,
|
||||||
final GenotypingModel genotypingModel,
|
final ReadBackedPileup pileup,
|
||||||
final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) {
|
final byte refBase,
|
||||||
final AlleleList<Allele> alleleList = new IndexedAlleleList<>(Allele.create(refBase,true), GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
|
final byte minBaseQual,
|
||||||
// Notice that the sample name is rather irrelevant as this information is never used; it just needs to be the same in both lines below.
|
final MathUtils.RunningAverage hqSoftClips) {
|
||||||
|
|
||||||
final int maximumReadCount = pileup.getReads().size();
|
final int likelihoodCount = ploidy + 1;
|
||||||
|
final double log10Ploidy = MathUtils.Log10Cache.get(ploidy);
|
||||||
|
|
||||||
final List<GATKSAMRecord> reads = new ArrayList<>(maximumReadCount);
|
final RefVsAnyResult result = new RefVsAnyResult(likelihoodCount);
|
||||||
final double[][] likelihoods = new double[2][maximumReadCount];
|
int readCount = 0;
|
||||||
final int[] adCounts = new int[2];
|
|
||||||
int nextIndex = 0;
|
|
||||||
for (final PileupElement p : pileup) {
|
for (final PileupElement p : pileup) {
|
||||||
final byte qual = p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual();
|
final byte qual = p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual();
|
||||||
if (!p.isDeletion() && qual <= minBaseQual)
|
if (!p.isDeletion() && qual <= minBaseQual)
|
||||||
continue;
|
continue;
|
||||||
final GATKSAMRecord read = p.getRead();
|
readCount++;
|
||||||
reads.add(read);
|
calcPileupElementRefVsNonRefLikelihoodAndCount(refBase, likelihoodCount, log10Ploidy, result, p, qual, hqSoftClips);
|
||||||
final boolean isAlt = p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart()
|
|
||||||
|| p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip();
|
|
||||||
final int bestAllele;
|
|
||||||
final int worstAllele;
|
|
||||||
if (isAlt) {
|
|
||||||
bestAllele = 1;
|
|
||||||
worstAllele = 0;
|
|
||||||
} else {
|
|
||||||
bestAllele = 0;
|
|
||||||
worstAllele = 1;
|
|
||||||
}
|
}
|
||||||
|
final double denominator = readCount * log10Ploidy;
|
||||||
likelihoods[bestAllele][nextIndex] = QualityUtils.qualToProbLog10(qual);
|
for (int i = 0; i < likelihoodCount; i++)
|
||||||
likelihoods[worstAllele][nextIndex++] = QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
|
result.genotypeLikelihoods[i] -= denominator;
|
||||||
adCounts[bestAllele]++;
|
|
||||||
if (isAlt && hqSoftClips != null && p.isNextToSoftClip())
|
|
||||||
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(read, (byte) 28));
|
|
||||||
}
|
|
||||||
|
|
||||||
final Map<String,List<GATKSAMRecord>> sampleToReads = Collections.singletonMap(sampleName,reads);
|
|
||||||
final ReadLikelihoods<Allele> readLikelihoods = new ReadLikelihoods<>(new IndexedSampleList(sampleName),alleleList,sampleToReads);
|
|
||||||
final ReadLikelihoods.Matrix<Allele> sampleLikelihoods = readLikelihoods.sampleMatrix(0);
|
|
||||||
final int readCount = sampleLikelihoods.readCount();
|
|
||||||
for (int i = 0; i < readCount; i++) {
|
|
||||||
sampleLikelihoods.set(0,i,likelihoods[0][i]);
|
|
||||||
sampleLikelihoods.set(1,i,likelihoods[1][i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
final PloidyModel ploidyModel = new HomogeneousPloidyModel(new IndexedSampleList(sampleName),ploidy);
|
|
||||||
final GenotypingLikelihoods<Allele> genotypingLikelihoods = genotypingModel.calculateLikelihoods(alleleList, new GenotypingData<>(ploidyModel, readLikelihoods));
|
|
||||||
final double[] genotypeLikelihoodArray = genotypingLikelihoods.sampleLikelihoods(0).getAsVector();
|
|
||||||
final RefVsAnyResult result = new RefVsAnyResult(genotypeLikelihoodArray.length);
|
|
||||||
System.arraycopy(genotypeLikelihoodArray,0,result.genotypeLikelihoods,0,genotypeLikelihoodArray.length);
|
|
||||||
System.arraycopy(adCounts,0,result.AD_Ref_Any,0,2);
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void calcPileupElementRefVsNonRefLikelihoodAndCount(final byte refBase, final int likelihoodCount, final double log10Ploidy, final RefVsAnyResult result, final PileupElement element, final byte qual, final MathUtils.RunningAverage hqSoftClips) {
|
||||||
|
final boolean isAlt = element.getBase() != refBase || element.isDeletion() || element.isBeforeDeletionStart()
|
||||||
|
|| element.isAfterDeletionEnd() || element.isBeforeInsertion() || element.isAfterInsertion() || element.isNextToSoftClip();
|
||||||
|
final double referenceLikelihood;
|
||||||
|
final double nonRefLikelihood;
|
||||||
|
if (isAlt) {
|
||||||
|
nonRefLikelihood = QualityUtils.qualToProbLog10(qual);
|
||||||
|
referenceLikelihood = QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
|
||||||
|
result.AD_Ref_Any[1]++;
|
||||||
|
} else {
|
||||||
|
referenceLikelihood = QualityUtils.qualToProbLog10(qual);
|
||||||
|
nonRefLikelihood = QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
|
||||||
|
result.AD_Ref_Any[0]++;
|
||||||
|
}
|
||||||
|
// Homozygous likelihoods don't need the logSum trick.
|
||||||
|
result.genotypeLikelihoods[0] += referenceLikelihood + log10Ploidy;
|
||||||
|
result.genotypeLikelihoods[likelihoodCount - 1] += nonRefLikelihood + log10Ploidy;
|
||||||
|
// Heterozygous likelihoods need the logSum trick:
|
||||||
|
for (int i = 1, j = likelihoodCount - 2; i < likelihoodCount - 1; i++, j--)
|
||||||
|
result.genotypeLikelihoods[i] +=
|
||||||
|
MathUtils.approximateLog10SumLog10(
|
||||||
|
referenceLikelihood + MathUtils.Log10Cache.get(j),
|
||||||
|
nonRefLikelihood + MathUtils.Log10Cache.get(i));
|
||||||
|
if (isAlt && hqSoftClips != null && element.isNextToSoftClip())
|
||||||
|
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(element.getRead(), (byte) 28));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get a list of pileups that span the entire active region span, in order, one for each position
|
* Get a list of pileups that span the entire active region span, in order, one for each position
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -205,7 +205,7 @@ public class QualityUtils {
|
||||||
@Ensures("result <= 0.0")
|
@Ensures("result <= 0.0")
|
||||||
public static double qualToErrorProbLog10(final double qual) {
|
public static double qualToErrorProbLog10(final double qual) {
|
||||||
if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual);
|
if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual);
|
||||||
return qual / -10.0;
|
return qual * -0.1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue