remove alt alleles, when genotype count is explosively large, based on alleles' highest supporting haplotype score; max tolerable genotype count is controlled by a default value overridable by user

remove alt alleles, when genotype count is explosively large, based on alleles' highest supporting haplotype score; max tolerable genotype count is controlled by a default value overridable by user
This commit is contained in:
Steve Huang 2016-06-30 22:36:49 -04:00 committed by GitHub
parent 6471bda0cd
commit 1ff234e7dd
6 changed files with 268 additions and 77 deletions

View File

@ -122,12 +122,32 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter.
*
* As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
* See also {@link #MAX_GENOTYPE_COUNT}.
*/
@Advanced
@Argument(fullName = "max_alternate_alleles", shortName = MAX_ALTERNATE_ALLELES_SHORT_NAME, doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 6;
/**
* If there are more than this number of genotypes at a locus presented to the genotyper, then only this many genotypes will be used.
* The possible genotypes are simply different ways of partitioning alleles given a specific ploidy assumption.
* Therefore, we remove genotypes from consideration by removing alternate alleles that are the least well supported.
* The estimate of allele support is based on the ranking of the candidate haplotypes coming out of the graph building step.
* Note that the reference allele is always kept.
*
* Note that genotyping sites with large genotype counts is both CPU and memory intensive.
* Unless there is a good reason to change the default value, we highly recommend that you not play around with this parameter.
*
* The maximum number of alternative alleles used in the genotyping step will be the lesser of the two:
* 1. the largest number of alt alleles, given ploidy, that yields a genotype count no higher than {@link #MAX_GENOTYPE_COUNT}
* 2. the value of {@link #MAX_ALTERNATE_ALLELES}
*
* See also {@link #MAX_ALTERNATE_ALLELES}.
*/
@Advanced
@Argument(fullName = "max_genotype_count", shortName = "maxGT", doc = "Maximum number of genotypes to consider at any site", required = false)
public int MAX_GENOTYPE_COUNT = 1024;
/**
* Determines the maximum number of PL values that will be logged in the output. If the number of genotypes
* (which is determined by the ploidy and the number of alleles) exceeds the value provided by this argument,

View File

@ -163,7 +163,7 @@ public class GenotypeLikelihoodCalculator {
* <p>This is in fact a shallow copy of {@link GenotypeLikelihoodCalculators#ploidyLog10} and is not meant to be modified by
* this class.</p>
*/
private final double[] log10;
private final double[] ploidyLog10;
/**
* Buffer field use as a temporal container for sorted allele counts when calculating the likelihood of a
@ -202,24 +202,22 @@ public class GenotypeLikelihoodCalculator {
* Creates a new calculator providing its ploidy and number of genotyping alleles.
*/
protected GenotypeLikelihoodCalculator(final int ploidy, final int alleleCount,
final int[][] alleleFirstGenotypeOffsetByPloidy,
final GenotypeAlleleCounts[][] genotypeTableByPloidy,
final double[] ploidyLog10) {
final int[][] alleleFirstGenotypeOffsetByPloidy,
final GenotypeAlleleCounts[][] genotypeTableByPloidy,
final double[] ploidyLog10) {
this.alleleFirstGenotypeOffsetByPloidy = alleleFirstGenotypeOffsetByPloidy;
genotypeAlleleCounts = genotypeTableByPloidy[ploidy];
this.alleleCount = alleleCount;
this.ploidy = ploidy;
genotypeCount = this.alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount];
if (genotypeCount == GenotypeLikelihoodCalculators.GENOTYPE_COUNT_OVERFLOW)
throw new IllegalArgumentException(
String.format("the combination of ploidy (%s) and number of alleles (%s) results in a very large number of genotypes (> %s). You need to limit ploidy or the number of alternative alleles to analyze this locus",
ploidy,alleleCount,Integer.MAX_VALUE));
alleleHeap = new IntMaxHeap(ploidy);
readLikelihoodsByGenotypeIndex = new double[genotypeCount][];
log10 = ploidyLog10;
this.ploidyLog10 = ploidyLog10;
// The number of possible components is limited by distinct allele count and ploidy.
maximumDistinctAllelesInGenotype = Math.min(ploidy, alleleCount);
genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype << 1];
genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype*2];
}
/**
@ -349,7 +347,7 @@ public class GenotypeLikelihoodCalculator {
*/
private double[] genotypeLikelihoods(final double[][] readLikelihoodsByGenotypeIndex, final int readCount) {
final double[] result = new double[genotypeCount];
final double denominator = readCount * log10[ploidy]; // instead of dividing each read likelihood by ploidy
final double denominator = readCount * ploidyLog10[ploidy]; // instead of dividing each read likelihood by ploidy
// ( so subtract log10(ploidy) ) we multiply them all and the divide by ploidy^readCount (so substract readCount * log10(ploidy) )
for (int g = 0; g < genotypeCount; g++) {
final double[] likelihoodsByRead = readLikelihoodsByGenotypeIndex[g];
@ -464,7 +462,9 @@ public class GenotypeLikelihoodCalculator {
* exactly one allele present in the genotype.
*/
private void singleComponentGenotypeLikelihoodByRead(final GenotypeAlleleCounts genotypeAlleleCounts,
final double[] likelihoodByRead, final double[] readLikelihoodComponentsByAlleleCount, final int readCount) {
final double[] likelihoodByRead,
final double[] readLikelihoodComponentsByAlleleCount,
final int readCount) {
final int allele = genotypeAlleleCounts.alleleIndexAt(0);
// the count of the only component must be = ploidy.
int offset = (allele * (ploidy + 1) + ploidy) * readCount;
@ -493,7 +493,7 @@ public class GenotypeLikelihoodCalculator {
// p = 2 because the frequency == 1 we already have it.
for (int frequency = 2, destinationOffset = frequency1Offset + readCount; frequency <= ploidy; frequency++) {
final double log10frequency = log10[frequency];
final double log10frequency = ploidyLog10[frequency];
for (int r = 0, sourceOffset = frequency1Offset; r < readCount; r++)
readAlleleLikelihoodByAlleleCount[destinationOffset++] =
readAlleleLikelihoodByAlleleCount[sourceOffset++] + log10frequency;
@ -620,7 +620,11 @@ public class GenotypeLikelihoodCalculator {
* @param destination where to store the new genotype index mapping to old.
* @param sortedAlleleCountsBuffer a buffer to re-use to get the genotype-allele-count's sorted allele counts.
*/
private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex, final GenotypeAlleleCounts alleleCounts, final int[] oldToNewAlleleIndexMap, final int[] destination, final int[] sortedAlleleCountsBuffer) {
private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex,
final GenotypeAlleleCounts alleleCounts,
final int[] oldToNewAlleleIndexMap,
final int[] destination,
final int[] sortedAlleleCountsBuffer) {
final int distinctAlleleCount = alleleCounts.distinctAlleleCount();
alleleCounts.copyAlleleCounts(sortedAlleleCountsBuffer,0);
for (int j = 0, jj = 0; j < distinctAlleleCount; j++) {

View File

@ -51,7 +51,11 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.util.Arrays;
import java.util.stream.IntStream;
/**
* Genotype likelihood calculator utility.
@ -116,7 +120,10 @@ public class GenotypeLikelihoodCalculators {
private volatile static GenotypeAlleleCounts[][] genotypeTableByPloidy =
buildGenotypeAlleleCountsTable(maximumPloidy,maximumAllele,alleleFirstGenotypeOffsetByPloidy);
/**
* Cached log10 values for the first integer up to the maximum ploidy requested thus far.
*/
private volatile static double[] ploidyLog10 = IntStream.range(0, maximumPloidy + 1).mapToDouble(Math::log10).toArray();
/**
* Build the table with the genotype offsets based on ploidy and the maximum allele index with representation
@ -291,40 +298,29 @@ public class GenotypeLikelihoodCalculators {
return result;
}
/**
* Cached log10 values for the first integer up to the maximum ploidy requested thus far.
*/
private volatile static double[] ploidyLog10;
// Initialize {@link #ploidyLog10}.
static {
ploidyLog10 = new double[maximumPloidy + 1];
for (int i = 0; i <= maximumPloidy; i++)
ploidyLog10[i] = Math.log10(i);
}
/**
* Returns an instance given its ploidy and the number of alleles.
*
* @param alleleCount the required allele-count.
* @param ploidy the required ploidy-count.
*
* @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is {@code null}, or
* the resulting number of genotypes is too large.
* @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is negative, or the resulting number of genotypes is too large.
*
* @return never {@code null}.
*/
public static GenotypeLikelihoodCalculator getInstance(final int ploidy,
final int alleleCount) {
public static GenotypeLikelihoodCalculator getInstance(final int ploidy, final int alleleCount) {
checkPloidyAndMaximumAllele(ploidy, alleleCount);
// Non-thread safe (fast) check on tables capacities,
// if not enough capacity we expand the tables in a thread-safe manner:
if (alleleCount > maximumAllele || ploidy > maximumPloidy)
ensureCapacity(alleleCount, ploidy);
// if not enough capacity we expand the tables in a thread-safe manner
// also checks if the requested ploidy and allele count result in a genotype count too large to deal with
if(calculateGenotypeCountUsingTables(ploidy, alleleCount) == GENOTYPE_COUNT_OVERFLOW){
final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1);
throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount));
}
// At this point the tables must have at least the requested capacity, likely to be much more.
return new GenotypeLikelihoodCalculator(ploidy,alleleCount,alleleFirstGenotypeOffsetByPloidy,genotypeTableByPloidy,ploidyLog10);
return new GenotypeLikelihoodCalculator(ploidy, alleleCount, alleleFirstGenotypeOffsetByPloidy, genotypeTableByPloidy, ploidyLog10);
}
/**
@ -413,14 +409,59 @@ public class GenotypeLikelihoodCalculators {
* @param ploidy the requested ploidy.
* @param alleleCount the requested number of alleles.
*
* @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative or
* the number of genotypes is too large (more than {@link Integer#MAX_VALUE}).
*
* @return the number of genotypes given ploidy and allele count (0 or greater).
*/
/**
 * Returns the genotype count for the given ploidy and allele count.
 *
 * @param ploidy the required ploidy; must not be negative
 * @param alleleCount the required allele count; must not be negative
 * @throws IllegalArgumentException if either argument is negative or the resulting
 *         genotype count exceeds {@link Integer#MAX_VALUE}
 * @return the number of genotypes (0 or greater)
 */
public final static int genotypeCount(final int ploidy, final int alleleCount) {
    final int count = calculateGenotypeCountUsingTables(ploidy, alleleCount);
    if (count != GENOTYPE_COUNT_OVERFLOW) {
        return count;
    }
    // Overflowed: compute the (approximate) true count only for the error message.
    final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1);
    throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount));
}
/**
 * Compute the maximally acceptable allele count (ref allele included) given the maximally acceptable genotype count.
 *
 * @param ploidy sample ploidy
 * @param maxGenotypeCount maximum number of genotypes (for the given ploidy) that is acceptable; must be at least 1
 * @throws IllegalArgumentException if {@code ploidy} is invalid or {@code maxGenotypeCount} is not positive.
 * @return the maximally acceptable allele count given ploidy and maximum number of genotypes acceptable
 */
public static int computeMaxAcceptableAlleleCount(final int ploidy, final int maxGenotypeCount){
    checkPloidyAndMaximumAllele(ploidy, ploidy); // a hack to check ploidy makes sense (could duplicate code but choice must be made)
    if (maxGenotypeCount < 1) { // log10 below is undefined for non-positive values
        throw new IllegalArgumentException("maxGenotypeCount must be at least 1 but was " + maxGenotypeCount);
    }
    final double log10MaxGenotypeCount = Math.log10(maxGenotypeCount);

    // Math explanation: genotype count is determined by ${P+A-1 \choose A-1}$, this leads to constraint
    // $\log(\frac{(P+A-1)!}{(A-1)!}) \le \log(P!G)$,
    // where $P$ is ploidy, $A$ is allele count, and $G$ is maxGenotypeCount
    // The upper and lower bounds of the left hand side of the constraint are $P \log(A-1+P)$ and $P \log(A)$
    // which require $A$ to be searched in interval $[10^{\log(P!G)/P} - (P-1), 10^{\log(P!G)/P}]$
    // Denote $10^{\log(P!G)/P}$ as $x$ in the code.
    final double x = Math.pow(10, (MathUtils.log10Factorial(ploidy) + log10MaxGenotypeCount)/ploidy );
    final int lower = (int)Math.floor(x) - ploidy - 1;
    final int upper = (int)Math.ceil(x);
    // Walk down from the upper bound: the first allele count whose genotype count fits is the largest acceptable one.
    for(int a=upper; a>=lower; --a){
        final double log10GTCnt = MathUtils.log10BinomialCoefficient(ploidy+a-1, a-1);
        if(log10MaxGenotypeCount >= log10GTCnt) {
            return a;
        }
    }
    throw new GATKException("Code should never reach here.");
}
/**
 * Computes the genotype count for the given ploidy and allele count using the shared lookup tables,
 * expanding them first when needed.
 *
 * @param ploidy the requested ploidy; must not be negative
 * @param alleleCount the requested number of alleles; must not be negative
 * @return the genotype count, or {@code GENOTYPE_COUNT_OVERFLOW} if it would exceed {@link Integer#MAX_VALUE}
 */
private static int calculateGenotypeCountUsingTables(int ploidy, int alleleCount) {
    checkPloidyAndMaximumAllele(ploidy, alleleCount);
    // Expand the shared tables (thread-safely) when this request exceeds their current capacity;
    // a single check suffices — ensureCapacity raises maximumPloidy/maximumAllele as a side effect.
    if (ploidy > maximumPloidy || alleleCount > maximumAllele) {
        ensureCapacity(alleleCount, ploidy);
    }
    return alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount];
}
}

View File

@ -56,6 +56,8 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.math3.stat.StatUtils;
import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
@ -73,6 +75,7 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
import java.util.stream.Collectors;
/**
* {@link HaplotypeCaller}'s genotyping strategy implementation.
@ -84,6 +87,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
private static final String phase10 = "1|0";
private static final int MAX_DROPPED_ALTERNATIVE_ALLELES_LOG_STRING_LENGTH = 500;
private final int MAX_GENOTYPE_COUNT_TO_ENUMERATE = configuration.genotypeArgs.MAX_GENOTYPE_COUNT;
private static final Map<Integer, Integer> practicaAlleleCountForPloidy = new HashMap<>();
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
protected final boolean doPhysicalPhasing;
@ -135,7 +141,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
}
/**
* Carries the result of a call to #assignGenotypeLikelihoods
* Carries the result of a call to {@link #assignGenotypeLikelihoods(List, ReadLikelihoods, Map, byte[], GenomeLoc, GenomeLoc, GenomeLocParser, RefMetaDataTracker, List, boolean)}
*/
public static class CalledHaplotypes {
private final List<VariantContext> calls;
@ -189,16 +195,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
@Ensures("result != null")
// TODO - can this be refactored? this is hard to follow!
CalledHaplotypes assignGenotypeLikelihoods( final List<Haplotype> haplotypes,
final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final RefMetaDataTracker tracker,
final List<VariantContext> activeAllelesToGenotype,
final boolean emitReferenceConfidence) {
CalledHaplotypes assignGenotypeLikelihoods(final List<Haplotype> haplotypes,
final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final RefMetaDataTracker tracker,
final List<VariantContext> activeAllelesToGenotype,
final boolean emitReferenceConfidence) {
// sanity check input arguments
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
@ -232,15 +238,17 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
// Merge the event to find a common reference representation
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc,
priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE,
false, false, null, false, false);
if( mergedVC == null )
continue;
final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP()
? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL;
final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP
: GenotypeLikelihoodsCalculationModel.Model.INDEL;
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
@ -248,13 +256,37 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
}
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
if( configuration.DEBUG && logger != null ) {
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
final ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
// if the number of allele is so high that enumerating all possible genotypes is impractical,
// as determined by MAX_GENOTYPE_COUNT_TO_ENUMERATE,
// trim alleles that are not well supported by good-scored haplotypes,
// otherwise alias, i.e. no trimming if genotype count is small
final int originalAlleleCount = alleleMapper.size();
Integer practicalAlleleCount = practicaAlleleCountForPloidy.get(ploidy);
if(practicalAlleleCount==null) { // maximum allele count given this ploidy and MAX_GENOTYPE_COUNT_TO_ENUMERATE hasn't been computed
practicalAlleleCount = GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(ploidy, MAX_GENOTYPE_COUNT_TO_ENUMERATE);
}
Map<Allele, List<Haplotype>> practicalAlleleMapper = null;
if (practicalAlleleCount < originalAlleleCount) {
practicalAlleleMapper = reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(alleleMapper, practicalAlleleCount);
if( configuration.DEBUG && logger != null ) {
logger.warn(String.format("Removed alt alleles where ploidy is %d and original allele count is %d, whereas after trimming the allele count becomes %d",
ploidy, originalAlleleCount, practicalAlleleCount));
logger.warn(String.format("Alleles kept are:%s", practicalAlleleMapper.keySet()));
}
}else{
practicalAlleleMapper = alleleMapper;
}
final ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(practicalAlleleMapper,
genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC),
ALLELE_EXTENSION));
if (configuration.isSampleContaminationPresent())
readAlleleLikelihoods.contaminationDownsampling(configuration.getSampleContamination());
@ -269,10 +301,23 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
readAlleleLikelihoods.addNonReferenceAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
}
final GenotypesContext genotypes = calculateGLsForThisEvent(readAlleleLikelihoods, noCallAlleles );
final GenotypesContext genotypes = calculateGLsForThisEvent(readAlleleLikelihoods, noCallAlleles);
final VariantContext call = calculateGenotypes(new VariantContextBuilder(mergedVC).alleles(readAlleleLikelihoods.alleles()).genotypes(genotypes).make(), calculationModel);
if ( call != null ) {
final VariantContext annotatedCall = annotateCall(readLikelihoods, perSampleFilteredReadList, ref, refLoc, genomeLocParser, tracker, emitReferenceConfidence, calledHaplotypes, mergedVC, alleleMapper, readAlleleLikelihoods, someAllelesWereDropped, call);
final VariantContext annotatedCall = annotateCall(readLikelihoods,
perSampleFilteredReadList,
ref,
refLoc,
genomeLocParser,
tracker,
emitReferenceConfidence,
calledHaplotypes,
mergedVC,
alleleMapper,
readAlleleLikelihoods,
someAllelesWereDropped,
call);
returnCalls.add( annotatedCall );
}
}
@ -300,7 +345,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
final boolean someAlternativeAllelesWereDropped = call.getAlleles().size() != initialAlleleNumber;
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker,readAlleleLikelihoodsForAnnotation, call, emitReferenceConfidence);
if (someAlternativeAllelesWereDropped || someAlternativeAllelesWereAlreadyDropped)
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
// maintain the set of all called haplotypes
for ( final Allele calledAllele : call.getAlleles() ) {
@ -312,6 +357,78 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
return !emitReferenceConfidence ? clearUnconfidentGenotypeCalls(annotatedCall) : annotatedCall;
}
/**
 * Trim excessive alt alleles when the combination of ploidy and alleles is so large that exhaustively enumerating
 * all possible genotypes is impractical.
 * Uses the highest supporting-haplotype score of each alt allele to decide which ones to trim away; a tie on the
 * highest score is broken by the second highest. The reference allele is never removed.
 *
 * @param alleleMapper original allele-to-haplotypes map to be trimmed; each alt allele is assumed to map to at
 *                     least one supporting haplotype
 * @param desiredNumOfAlleles desired allele count (ref allele included, and never removed)
 * @return trimmed map of the desired size containing the ref allele plus the best-supported alt alleles
 */
private Map<Allele, List<Haplotype>> reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(final Map<Allele, List<Haplotype>> alleleMapper, final int desiredNumOfAlleles) {
    // The queue must poll the BEST-scored allele first. AlleleScoredByHaplotypeScores.compareTo() returns a
    // positive value when "this" is better supported, so negate it (PriorityQueue is a min-heap) to obtain a
    // max priority queue. BUG FIX: the previous "(sa1, sa2) -> - sa2.compareTo(sa1)" both swapped the operands
    // AND negated, cancelling out into a min-heap that polled the WORST-scored allele first.
    final PriorityQueue<AlleleScoredByHaplotypeScores> altAlleleMaxPriorityQ = new PriorityQueue<>((sa1, sa2) -> - sa1.compareTo(sa2));
    final Set<Allele> allelesToRetain = new HashSet<>();

    // populate the priority queue with the alt alleles' best and second-best supporting haplotype scores
    for(final Allele allele : alleleMapper.keySet()){
        if(allele.isReference()){ // collect scores information only on alt alleles; ref allele is never trimmed by this function
            allelesToRetain.add(allele);
            continue;
        }
        final List<Double> hapScores = alleleMapper.get(allele).stream().map(Haplotype::getScore).collect(Collectors.toList());
        Collections.sort(hapScores);
        final Double highestScore = hapScores.get(hapScores.size()-1);
        // NEGATIVE_INFINITY marks "no second supporting haplotype" so it loses any tie-break against a real score
        final Double secondHighestScore = hapScores.size()>1 ? hapScores.get(hapScores.size()-2) : Double.NEGATIVE_INFINITY;
        altAlleleMaxPriorityQ.add(new AlleleScoredByHaplotypeScores(allele, highestScore, secondHighestScore));
    }

    // retain the best-supported alt alleles until the desired count is reached (or the queue runs dry)
    while(allelesToRetain.size() < desiredNumOfAlleles && !altAlleleMaxPriorityQ.isEmpty()){
        allelesToRetain.add(altAlleleMaxPriorityQ.poll().getAllele());
    }

    return alleleMapper.entrySet().stream().filter(p -> allelesToRetain.contains(p.getKey()))
            .collect(Collectors.toMap(p->p.getKey(), p->p.getValue()));
}
/**
 * A utility class that orders alt alleles by their supporting haplotypes' scores.
 * The primary sort key is the best supporting haplotype score; a tie is broken by the second best score.
 * An allele with only one supporting haplotype has its second best score set to
 * {@link Double#NEGATIVE_INFINITY}, so on a best-score tie it is considered "worse" than another allele
 * with the same best haplotype score that does have a real second best score.
 * Two alleles tied on both scores compare as equal (0), which resolves the previously undecided
 * double-tie case and keeps {@link Comparable#compareTo(Object)} antisymmetric.
 */
private static final class AlleleScoredByHaplotypeScores implements Comparable<AlleleScoredByHaplotypeScores> {
    private final Allele allele;
    private final Double bestHaplotypeScore;
    private final Double secondBestHaplotypeScore;

    public AlleleScoredByHaplotypeScores(final Allele allele, final Double bestHaplotypeScore, final Double secondBestHaplotypeScore){
        this.allele = allele;
        this.bestHaplotypeScore = bestHaplotypeScore;
        this.secondBestHaplotypeScore = secondBestHaplotypeScore;
    }

    /**
     * @return a positive value if this allele is better supported than {@code other}, a negative value if it is
     *         worse supported, and 0 if both the best and second best scores are tied.
     */
    @Override
    public int compareTo(final AlleleScoredByHaplotypeScores other) {
        // Double.compare handles equal values symmetrically (the hand-rolled comparison returned -1 both ways on a tie)
        final int byBestScore = Double.compare(bestHaplotypeScore, other.bestHaplotypeScore);
        return byBestScore != 0 ? byBestScore
                                : Double.compare(secondBestHaplotypeScore, other.secondBestHaplotypeScore);
    }

    public Allele getAllele(){
        return allele;
    }
}
/**
* Reduce the number alternative alleles in a read-likelihoods collection to the maximum-alt-allele user parameter value.
* <p>
@ -462,7 +579,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
*/
@VisibleForTesting
static Map<VariantContext, Set<Haplotype>> constructHaplotypeMapping(final List<VariantContext> originalCalls,
final Set<Haplotype> calledHaplotypes) {
final Set<Haplotype> calledHaplotypes) {
final Map<VariantContext, Set<Haplotype>> haplotypeMap = new HashMap<>(originalCalls.size());
for ( final VariantContext call : originalCalls ) {
// don't try to phase if there is not exactly 1 alternate allele
@ -674,14 +791,13 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
// Builds the read-likelihoods collection to use for annotation considering user arguments and the collection
// used for genotyping.
protected ReadLikelihoods<Allele> prepareReadAlleleLikelihoodsForAnnotation(
final ReadLikelihoods<Haplotype> readHaplotypeLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final GenomeLocParser genomeLocParser,
final boolean emitReferenceConfidence,
final Map<Allele, List<Haplotype>> alleleMapper,
final ReadLikelihoods<Allele> readAlleleLikelihoodsForGenotyping,
final VariantContext call) {
protected ReadLikelihoods<Allele> prepareReadAlleleLikelihoodsForAnnotation(final ReadLikelihoods<Haplotype> readHaplotypeLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final GenomeLocParser genomeLocParser,
final boolean emitReferenceConfidence,
final Map<Allele, List<Haplotype>> alleleMapper,
final ReadLikelihoods<Allele> readAlleleLikelihoodsForGenotyping,
final VariantContext call) {
final ReadLikelihoods<Allele> readAlleleLikelihoodsForAnnotations;
final GenomeLoc loc = genomeLocParser.createGenomeLoc(call);
@ -744,10 +860,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
* @return never {@code null} but perhaps an empty list if there is no variants to report.
*/
protected TreeSet<Integer> decomposeHaplotypesIntoVariantContexts(final List<Haplotype> haplotypes,
final ReadLikelihoods readLikelihoods,
final byte[] ref,
final GenomeLoc refLoc,
final List<VariantContext> activeAllelesToGenotype) {
final ReadLikelihoods readLikelihoods,
final byte[] ref,
final GenomeLoc refLoc,
final List<VariantContext> activeAllelesToGenotype) {
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
@ -782,8 +898,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
}
protected List<VariantContext> getVCsAtThisLocation(final List<Haplotype> haplotypes,
final int loc,
final List<VariantContext> activeAllelesToGenotype) {
final int loc,
final List<VariantContext> activeAllelesToGenotype) {
// the overlapping events to merge into a common reference view
final List<VariantContext> eventsAtThisLoc = new ArrayList<>();

View File

@ -170,6 +170,15 @@ public class GenotypeLikelihoodCalculatorUnitTest {
}
// Spot-checks computeMaxAcceptableAlleleCount for a fixed max genotype count of 1024 across
// increasing ploidies. For ploidy 1 the genotype count equals the allele count (C(A, A-1) = A),
// so all 1024 alleles are acceptable; larger ploidies admit sharply fewer alleles.
// NOTE(review): if this test class uses TestNG, Assert.assertEquals takes (actual, expected);
// these calls pass the expected constant first. The equality check is symmetric so the assertions
// still pass either way, but failure messages would be reversed — confirm the framework and
// consider swapping the arguments.
@Test
public void testComputeMaxAcceptableAlleleCount(){
Assert.assertEquals(1024, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(1, 1024));
Assert.assertEquals(44, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(2, 1024));
Assert.assertEquals(17, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(3, 1024));
Assert.assertEquals(5, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(10, 1024));
Assert.assertEquals(3, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(20, 1024));
Assert.assertEquals(2, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(100, 1024));
}
// Simple inefficient calculation of the genotype count given the ploidy.
private int calculateGenotypeCount(final int ploidy, final int alleleCount) {
if (ploidy == 0)

View File

@ -130,6 +130,7 @@ public class Haplotype extends Allele {
final Haplotype ret = new Haplotype(newBases, isReference());
ret.setCigar(newCigar);
ret.setGenomeLocation(loc);
ret.setScore(score);
ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef());
return ret;
}