remove alt alleles, when genotype count is explosively large, based on alleles' highest supporting haplotype score; max tolerable genotype count is controlled by a default value overridable by user

remove alt alleles, when genotype count is explosively large, based on alleles' highest supporting haplotype score; max tolerable genotype count is controlled by a default value overridable by user
This commit is contained in:
Steve Huang 2016-06-30 22:36:49 -04:00 committed by GitHub
parent 6471bda0cd
commit 1ff234e7dd
6 changed files with 268 additions and 77 deletions

View File

@ -122,12 +122,32 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter.
*
* As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
* See also {@link #MAX_GENOTYPE_COUNT}.
*/
@Advanced
@Argument(fullName = "max_alternate_alleles", shortName = MAX_ALTERNATE_ALLELES_SHORT_NAME, doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 6;
/**
* If there are more than this number of genotypes at a locus presented to the genotyper, then only this many genotypes will be used.
* The possible genotypes are simply different ways of partitioning alleles given a specific ploidy assumption.
* Therefore, we remove genotypes from consideration by removing alternate alleles that are the least well supported.
* The estimate of allele support is based on the ranking of the candidate haplotypes coming out of the graph building step.
* Note that the reference allele is always kept.
*
* Note that genotyping sites with large genotype counts is both CPU and memory intensive.
* Unless there is a good reason to change the default value, we highly recommend that you not play around with this parameter.
*
* The maximum number of alternative alleles used in the genotyping step will be the lesser of the two:
* 1. the largest number of alt alleles, given ploidy, that yields a genotype count no higher than {@link #MAX_GENOTYPE_COUNT}
* 2. the value of {@link #MAX_ALTERNATE_ALLELES}
*
* See also {@link #MAX_ALTERNATE_ALLELES}.
*/
@Advanced
@Argument(fullName = "max_genotype_count", shortName = "maxGT", doc = "Maximum number of genotypes to consider at any site", required = false)
public int MAX_GENOTYPE_COUNT = 1024;
/**
* Determines the maximum number of PL values that will be logged in the output. If the number of genotypes
* (which is determined by the ploidy and the number of alleles) exceeds the value provided by this argument,

View File

@ -163,7 +163,7 @@ public class GenotypeLikelihoodCalculator {
* <p>This is in fact a shallow copy of {@link GenotypeLikelihoodCalculators#ploidyLog10} and is not meant to be modified by
* this class.</p>
*/
private final double[] log10;
private final double[] ploidyLog10;
/**
* Buffer field use as a temporal container for sorted allele counts when calculating the likelihood of a
@ -202,24 +202,22 @@ public class GenotypeLikelihoodCalculator {
* Creates a new calculator providing its ploidy and number of genotyping alleles.
*/
protected GenotypeLikelihoodCalculator(final int ploidy, final int alleleCount,
final int[][] alleleFirstGenotypeOffsetByPloidy,
final GenotypeAlleleCounts[][] genotypeTableByPloidy,
final double[] ploidyLog10) {
final int[][] alleleFirstGenotypeOffsetByPloidy,
final GenotypeAlleleCounts[][] genotypeTableByPloidy,
final double[] ploidyLog10) {
this.alleleFirstGenotypeOffsetByPloidy = alleleFirstGenotypeOffsetByPloidy;
genotypeAlleleCounts = genotypeTableByPloidy[ploidy];
this.alleleCount = alleleCount;
this.ploidy = ploidy;
genotypeCount = this.alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount];
if (genotypeCount == GenotypeLikelihoodCalculators.GENOTYPE_COUNT_OVERFLOW)
throw new IllegalArgumentException(
String.format("the combination of ploidy (%s) and number of alleles (%s) results in a very large number of genotypes (> %s). You need to limit ploidy or the number of alternative alleles to analyze this locus",
ploidy,alleleCount,Integer.MAX_VALUE));
alleleHeap = new IntMaxHeap(ploidy);
readLikelihoodsByGenotypeIndex = new double[genotypeCount][];
log10 = ploidyLog10;
this.ploidyLog10 = ploidyLog10;
// The number of possible components is limited by distinct allele count and ploidy.
maximumDistinctAllelesInGenotype = Math.min(ploidy, alleleCount);
genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype << 1];
genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype*2];
}
/**
@ -349,7 +347,7 @@ public class GenotypeLikelihoodCalculator {
*/
private double[] genotypeLikelihoods(final double[][] readLikelihoodsByGenotypeIndex, final int readCount) {
final double[] result = new double[genotypeCount];
final double denominator = readCount * log10[ploidy]; // instead of dividing each read likelihood by ploidy
final double denominator = readCount * ploidyLog10[ploidy]; // instead of dividing each read likelihood by ploidy
// ( so subtract log10(ploidy) ) we multiply them all and the divide by ploidy^readCount (so substract readCount * log10(ploidy) )
for (int g = 0; g < genotypeCount; g++) {
final double[] likelihoodsByRead = readLikelihoodsByGenotypeIndex[g];
@ -464,7 +462,9 @@ public class GenotypeLikelihoodCalculator {
* exactly one allele present in the genotype.
*/
private void singleComponentGenotypeLikelihoodByRead(final GenotypeAlleleCounts genotypeAlleleCounts,
final double[] likelihoodByRead, final double[] readLikelihoodComponentsByAlleleCount, final int readCount) {
final double[] likelihoodByRead,
final double[] readLikelihoodComponentsByAlleleCount,
final int readCount) {
final int allele = genotypeAlleleCounts.alleleIndexAt(0);
// the count of the only component must be = ploidy.
int offset = (allele * (ploidy + 1) + ploidy) * readCount;
@ -493,7 +493,7 @@ public class GenotypeLikelihoodCalculator {
// p = 2 because the frequency == 1 we already have it.
for (int frequency = 2, destinationOffset = frequency1Offset + readCount; frequency <= ploidy; frequency++) {
final double log10frequency = log10[frequency];
final double log10frequency = ploidyLog10[frequency];
for (int r = 0, sourceOffset = frequency1Offset; r < readCount; r++)
readAlleleLikelihoodByAlleleCount[destinationOffset++] =
readAlleleLikelihoodByAlleleCount[sourceOffset++] + log10frequency;
@ -620,7 +620,11 @@ public class GenotypeLikelihoodCalculator {
* @param destination where to store the new genotype index mapping to old.
* @param sortedAlleleCountsBuffer a buffer to re-use to get the genotype-allele-count's sorted allele counts.
*/
private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex, final GenotypeAlleleCounts alleleCounts, final int[] oldToNewAlleleIndexMap, final int[] destination, final int[] sortedAlleleCountsBuffer) {
private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex,
final GenotypeAlleleCounts alleleCounts,
final int[] oldToNewAlleleIndexMap,
final int[] destination,
final int[] sortedAlleleCountsBuffer) {
final int distinctAlleleCount = alleleCounts.distinctAlleleCount();
alleleCounts.copyAlleleCounts(sortedAlleleCountsBuffer,0);
for (int j = 0, jj = 0; j < distinctAlleleCount; j++) {

View File

@ -51,7 +51,11 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.util.Arrays;
import java.util.stream.IntStream;
/**
* Genotype likelihood calculator utility.
@ -116,7 +120,10 @@ public class GenotypeLikelihoodCalculators {
private volatile static GenotypeAlleleCounts[][] genotypeTableByPloidy =
buildGenotypeAlleleCountsTable(maximumPloidy,maximumAllele,alleleFirstGenotypeOffsetByPloidy);
/**
* Cached log10 values for the first integer up to the maximum ploidy requested thus far.
*/
private volatile static double[] ploidyLog10 = IntStream.range(0, maximumPloidy + 1).mapToDouble(Math::log10).toArray();
/**
* Build the table with the genotype offsets based on ploidy and the maximum allele index with representation
@ -291,40 +298,29 @@ public class GenotypeLikelihoodCalculators {
return result;
}
/**
* Cached log10 values for the first integer up to the maximum ploidy requested thus far.
*/
private volatile static double[] ploidyLog10;
// Initialize {@link #ploidyLog10}.
static {
ploidyLog10 = new double[maximumPloidy + 1];
for (int i = 0; i <= maximumPloidy; i++)
ploidyLog10[i] = Math.log10(i);
}
/**
* Returns an instance given its ploidy and the number of alleles.
*
* @param alleleCount the required allele-count.
* @param ploidy the required ploidy-count.
*
* @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is {@code null}, or
* the resulting number of genotypes is too large.
* @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is negative, or the resulting number of genotypes is too large.
*
* @return never {@code null}.
*/
public static GenotypeLikelihoodCalculator getInstance(final int ploidy,
final int alleleCount) {
public static GenotypeLikelihoodCalculator getInstance(final int ploidy, final int alleleCount) {
checkPloidyAndMaximumAllele(ploidy, alleleCount);
// Non-thread safe (fast) check on tables capacities,
// if not enough capacity we expand the tables in a thread-safe manner:
if (alleleCount > maximumAllele || ploidy > maximumPloidy)
ensureCapacity(alleleCount, ploidy);
// if not enough capacity we expand the tables in a thread-safe manner
// also checks if the requested ploidy and allele count result in a genotype count too large to deal with
if(calculateGenotypeCountUsingTables(ploidy, alleleCount) == GENOTYPE_COUNT_OVERFLOW){
final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1);
throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount));
}
// At this point the tables must have at least the requested capacity, likely to be much more.
return new GenotypeLikelihoodCalculator(ploidy,alleleCount,alleleFirstGenotypeOffsetByPloidy,genotypeTableByPloidy,ploidyLog10);
return new GenotypeLikelihoodCalculator(ploidy, alleleCount, alleleFirstGenotypeOffsetByPloidy, genotypeTableByPloidy, ploidyLog10);
}
/**
@ -413,14 +409,59 @@ public class GenotypeLikelihoodCalculators {
* @param ploidy the requested ploidy.
* @param alleleCount the requested number of alleles.
*
* @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative or
* the number of genotypes is too large (more than {@link Integer#MAX_VALUE}).
*
* @return the number of genotypes given ploidy and allele count (0 or greater).
*/
/**
 * Returns the genotype count for the given ploidy and allele count.
 *
 * @param ploidy the required ploidy; must not be negative
 * @param alleleCount the required allele count; must not be negative
 * @throws IllegalArgumentException if either argument is negative or the resulting
 *         genotype count exceeds {@link Integer#MAX_VALUE}
 * @return the number of genotypes (0 or greater)
 */
public final static int genotypeCount(final int ploidy, final int alleleCount) {
    final int count = calculateGenotypeCountUsingTables(ploidy, alleleCount);
    if (count != GENOTYPE_COUNT_OVERFLOW) {
        return count;
    }
    // Overflowed: compute the (approximate) true count only for the error message.
    final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1);
    throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount));
}
/**
 * Compute the maximally acceptable allele count (ref allele included) given the maximally acceptable genotype count.
 *
 * @param ploidy sample ploidy
 * @param maxGenotypeCount maximum number of genotypes (for the given ploidy) that is acceptable; must be at least 1
 * @throws IllegalArgumentException if {@code ploidy} is invalid or {@code maxGenotypeCount} is not positive.
 * @return the maximally acceptable allele count given ploidy and maximum number of genotypes acceptable
 */
public static int computeMaxAcceptableAlleleCount(final int ploidy, final int maxGenotypeCount){
    checkPloidyAndMaximumAllele(ploidy, ploidy); // a hack to check ploidy makes sense (could duplicate code but choice must be made)
    if (maxGenotypeCount < 1) { // log10 below is undefined for non-positive values
        throw new IllegalArgumentException("maxGenotypeCount must be at least 1 but was " + maxGenotypeCount);
    }
    final double log10MaxGenotypeCount = Math.log10(maxGenotypeCount);

    // Math explanation: genotype count is determined by ${P+A-1 \choose A-1}$, this leads to constraint
    // $\log(\frac{(P+A-1)!}{(A-1)!}) \le \log(P!G)$,
    // where $P$ is ploidy, $A$ is allele count, and $G$ is maxGenotypeCount
    // The upper and lower bounds of the left hand side of the constraint are $P \log(A-1+P)$ and $P \log(A)$
    // which require $A$ to be searched in interval $[10^{\log(P!G)/P} - (P-1), 10^{\log(P!G)/P}]$
    // Denote $10^{\log(P!G)/P}$ as $x$ in the code.
    final double x = Math.pow(10, (MathUtils.log10Factorial(ploidy) + log10MaxGenotypeCount)/ploidy );
    final int lower = (int)Math.floor(x) - ploidy - 1;
    final int upper = (int)Math.ceil(x);
    // Walk down from the upper bound: the first allele count whose genotype count fits is the largest acceptable one.
    for(int a=upper; a>=lower; --a){
        final double log10GTCnt = MathUtils.log10BinomialCoefficient(ploidy+a-1, a-1);
        if(log10MaxGenotypeCount >= log10GTCnt) {
            return a;
        }
    }
    throw new GATKException("Code should never reach here.");
}
/**
 * Computes the genotype count for the given ploidy and allele count using the shared lookup tables,
 * expanding them first when needed.
 *
 * @param ploidy the requested ploidy; must not be negative
 * @param alleleCount the requested number of alleles; must not be negative
 * @return the genotype count, or {@code GENOTYPE_COUNT_OVERFLOW} if it would exceed {@link Integer#MAX_VALUE}
 */
private static int calculateGenotypeCountUsingTables(int ploidy, int alleleCount) {
    checkPloidyAndMaximumAllele(ploidy, alleleCount);
    // Expand the shared tables (thread-safely) when this request exceeds their current capacity;
    // a single check suffices — ensureCapacity raises maximumPloidy/maximumAllele as a side effect.
    if (ploidy > maximumPloidy || alleleCount > maximumAllele) {
        ensureCapacity(alleleCount, ploidy);
    }
    return alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount];
}
}

View File

@ -56,6 +56,8 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.math3.stat.StatUtils;
import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
@ -73,6 +75,7 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
import java.util.stream.Collectors;
/**
* {@link HaplotypeCaller}'s genotyping strategy implementation.
@ -84,6 +87,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
private static final String phase10 = "1|0";
private static final int MAX_DROPPED_ALTERNATIVE_ALLELES_LOG_STRING_LENGTH = 500;
private final int MAX_GENOTYPE_COUNT_TO_ENUMERATE = configuration.genotypeArgs.MAX_GENOTYPE_COUNT;
private static final Map<Integer, Integer> practicaAlleleCountForPloidy = new HashMap<>();
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
protected final boolean doPhysicalPhasing;
@ -135,7 +141,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
}
/**
* Carries the result of a call to #assignGenotypeLikelihoods
* Carries the result of a call to {@link #assignGenotypeLikelihoods(List, ReadLikelihoods, Map, byte[], GenomeLoc, GenomeLoc, GenomeLocParser, RefMetaDataTracker, List, boolean)}
*/
public static class CalledHaplotypes {
private final List<VariantContext> calls;
@ -189,16 +195,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
@Ensures("result != null")
// TODO - can this be refactored? this is hard to follow!
CalledHaplotypes assignGenotypeLikelihoods( final List<Haplotype> haplotypes,
final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final RefMetaDataTracker tracker,
final List<VariantContext> activeAllelesToGenotype,
final boolean emitReferenceConfidence) {
CalledHaplotypes assignGenotypeLikelihoods(final List<Haplotype> haplotypes,
final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final RefMetaDataTracker tracker,
final List<VariantContext> activeAllelesToGenotype,
final boolean emitReferenceConfidence) {
// sanity check input arguments
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
@ -232,15 +238,17 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
// Merge the event to find a common reference representation
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc,
priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE,
false, false, null, false, false);
if( mergedVC == null )
continue;
final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP()
? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL;
final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP
: GenotypeLikelihoodsCalculationModel.Model.INDEL;
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
@ -248,13 +256,37 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
}
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
if( configuration.DEBUG && logger != null ) {
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
final ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
// if the number of allele is so high that enumerating all possible genotypes is impractical,
// as determined by MAX_GENOTYPE_COUNT_TO_ENUMERATE,
// trim alleles that are not well supported by good-scored haplotypes,
// otherwise alias, i.e. no trimming if genotype count is small
final int originalAlleleCount = alleleMapper.size();
Integer practicalAlleleCount = practicaAlleleCountForPloidy.get(ploidy);
if(practicalAlleleCount==null) { // maximum allele count given this ploidy and MAX_GENOTYPE_COUNT_TO_ENUMERATE hasn't been computed
practicalAlleleCount = GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(ploidy, MAX_GENOTYPE_COUNT_TO_ENUMERATE);
}
Map<Allele, List<Haplotype>> practicalAlleleMapper = null;
if (practicalAlleleCount < originalAlleleCount) {
practicalAlleleMapper = reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(alleleMapper, practicalAlleleCount);
if( configuration.DEBUG && logger != null ) {
logger.warn(String.format("Removed alt alleles where ploidy is %d and original allele count is %d, whereas after trimming the allele count becomes %d",
ploidy, originalAlleleCount, practicalAlleleCount));
logger.warn(String.format("Alleles kept are:%s", practicalAlleleMapper.keySet()));
}
}else{
practicalAlleleMapper = alleleMapper;
}
final ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(practicalAlleleMapper,
genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC),
ALLELE_EXTENSION));
if (configuration.isSampleContaminationPresent())
readAlleleLikelihoods.contaminationDownsampling(configuration.getSampleContamination());
@ -269,10 +301,23 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
readAlleleLikelihoods.addNonReferenceAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
}
final GenotypesContext genotypes = calculateGLsForThisEvent(readAlleleLikelihoods, noCallAlleles );
final GenotypesContext genotypes = calculateGLsForThisEvent(readAlleleLikelihoods, noCallAlleles);
final VariantContext call = calculateGenotypes(new VariantContextBuilder(mergedVC).alleles(readAlleleLikelihoods.alleles()).genotypes(genotypes).make(), calculationModel);
if ( call != null ) {
final VariantContext annotatedCall = annotateCall(readLikelihoods, perSampleFilteredReadList, ref, refLoc, genomeLocParser, tracker, emitReferenceConfidence, calledHaplotypes, mergedVC, alleleMapper, readAlleleLikelihoods, someAllelesWereDropped, call);
final VariantContext annotatedCall = annotateCall(readLikelihoods,
perSampleFilteredReadList,
ref,
refLoc,
genomeLocParser,
tracker,
emitReferenceConfidence,
calledHaplotypes,
mergedVC,
alleleMapper,
readAlleleLikelihoods,
someAllelesWereDropped,
call);
returnCalls.add( annotatedCall );
}
}
@ -300,7 +345,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
final boolean someAlternativeAllelesWereDropped = call.getAlleles().size() != initialAlleleNumber;
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker,readAlleleLikelihoodsForAnnotation, call, emitReferenceConfidence);
if (someAlternativeAllelesWereDropped || someAlternativeAllelesWereAlreadyDropped)
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
// maintain the set of all called haplotypes
for ( final Allele calledAllele : call.getAlleles() ) {
@ -312,6 +357,78 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
return !emitReferenceConfidence ? clearUnconfidentGenotypeCalls(annotatedCall) : annotatedCall;
}
/**
 * Trim excessive alt alleles when the combination of ploidy and alleles is so large that exhaustively enumerating
 * all possible genotypes is impractical.
 * Uses the highest supporting-haplotype score of each alt allele to decide which ones to trim away; a tie on the
 * highest score is broken by the second highest. The reference allele is never removed.
 *
 * @param alleleMapper original allele-to-haplotypes map to be trimmed; each alt allele is assumed to map to at
 *                     least one supporting haplotype
 * @param desiredNumOfAlleles desired allele count (ref allele included, and never removed)
 * @return trimmed map of the desired size containing the ref allele plus the best-supported alt alleles
 */
private Map<Allele, List<Haplotype>> reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(final Map<Allele, List<Haplotype>> alleleMapper, final int desiredNumOfAlleles) {
    // The queue must poll the BEST-scored allele first. AlleleScoredByHaplotypeScores.compareTo() returns a
    // positive value when "this" is better supported, so negate it (PriorityQueue is a min-heap) to obtain a
    // max priority queue. BUG FIX: the previous "(sa1, sa2) -> - sa2.compareTo(sa1)" both swapped the operands
    // AND negated, cancelling out into a min-heap that polled the WORST-scored allele first.
    final PriorityQueue<AlleleScoredByHaplotypeScores> altAlleleMaxPriorityQ = new PriorityQueue<>((sa1, sa2) -> - sa1.compareTo(sa2));
    final Set<Allele> allelesToRetain = new HashSet<>();

    // populate the priority queue with the alt alleles' best and second-best supporting haplotype scores
    for(final Allele allele : alleleMapper.keySet()){
        if(allele.isReference()){ // collect scores information only on alt alleles; ref allele is never trimmed by this function
            allelesToRetain.add(allele);
            continue;
        }
        final List<Double> hapScores = alleleMapper.get(allele).stream().map(Haplotype::getScore).collect(Collectors.toList());
        Collections.sort(hapScores);
        final Double highestScore = hapScores.get(hapScores.size()-1);
        // NEGATIVE_INFINITY marks "no second supporting haplotype" so it loses any tie-break against a real score
        final Double secondHighestScore = hapScores.size()>1 ? hapScores.get(hapScores.size()-2) : Double.NEGATIVE_INFINITY;
        altAlleleMaxPriorityQ.add(new AlleleScoredByHaplotypeScores(allele, highestScore, secondHighestScore));
    }

    // retain the best-supported alt alleles until the desired count is reached (or the queue runs dry)
    while(allelesToRetain.size() < desiredNumOfAlleles && !altAlleleMaxPriorityQ.isEmpty()){
        allelesToRetain.add(altAlleleMaxPriorityQ.poll().getAllele());
    }

    return alleleMapper.entrySet().stream().filter(p -> allelesToRetain.contains(p.getKey()))
            .collect(Collectors.toMap(p->p.getKey(), p->p.getValue()));
}
/**
 * A utility class that orders alt alleles by their supporting haplotypes' scores.
 * The primary sort key is the best supporting haplotype score; a tie is broken by the second best score.
 * An allele with only one supporting haplotype has its second best score set to
 * {@link Double#NEGATIVE_INFINITY}, so on a best-score tie it is considered "worse" than another allele
 * with the same best haplotype score that does have a real second best score.
 * Two alleles tied on both scores compare as equal (0), which resolves the previously undecided
 * double-tie case and keeps {@link Comparable#compareTo(Object)} antisymmetric.
 */
private static final class AlleleScoredByHaplotypeScores implements Comparable<AlleleScoredByHaplotypeScores> {
    private final Allele allele;
    private final Double bestHaplotypeScore;
    private final Double secondBestHaplotypeScore;

    public AlleleScoredByHaplotypeScores(final Allele allele, final Double bestHaplotypeScore, final Double secondBestHaplotypeScore){
        this.allele = allele;
        this.bestHaplotypeScore = bestHaplotypeScore;
        this.secondBestHaplotypeScore = secondBestHaplotypeScore;
    }

    /**
     * @return a positive value if this allele is better supported than {@code other}, a negative value if it is
     *         worse supported, and 0 if both the best and second best scores are tied.
     */
    @Override
    public int compareTo(final AlleleScoredByHaplotypeScores other) {
        // Double.compare handles equal values symmetrically (the hand-rolled comparison returned -1 both ways on a tie)
        final int byBestScore = Double.compare(bestHaplotypeScore, other.bestHaplotypeScore);
        return byBestScore != 0 ? byBestScore
                                : Double.compare(secondBestHaplotypeScore, other.secondBestHaplotypeScore);
    }

    public Allele getAllele(){
        return allele;
    }
}
/**
* Reduce the number alternative alleles in a read-likelihoods collection to the maximum-alt-allele user parameter value.
* <p>
@ -462,7 +579,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
*/
@VisibleForTesting
static Map<VariantContext, Set<Haplotype>> constructHaplotypeMapping(final List<VariantContext> originalCalls,
final Set<Haplotype> calledHaplotypes) {
final Set<Haplotype> calledHaplotypes) {
final Map<VariantContext, Set<Haplotype>> haplotypeMap = new HashMap<>(originalCalls.size());
for ( final VariantContext call : originalCalls ) {
// don't try to phase if there is not exactly 1 alternate allele
@ -674,14 +791,13 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
// Builds the read-likelihoods collection to use for annotation considering user arguments and the collection
// used for genotyping.
protected ReadLikelihoods<Allele> prepareReadAlleleLikelihoodsForAnnotation(
final ReadLikelihoods<Haplotype> readHaplotypeLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final GenomeLocParser genomeLocParser,
final boolean emitReferenceConfidence,
final Map<Allele, List<Haplotype>> alleleMapper,
final ReadLikelihoods<Allele> readAlleleLikelihoodsForGenotyping,
final VariantContext call) {
protected ReadLikelihoods<Allele> prepareReadAlleleLikelihoodsForAnnotation(final ReadLikelihoods<Haplotype> readHaplotypeLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final GenomeLocParser genomeLocParser,
final boolean emitReferenceConfidence,
final Map<Allele, List<Haplotype>> alleleMapper,
final ReadLikelihoods<Allele> readAlleleLikelihoodsForGenotyping,
final VariantContext call) {
final ReadLikelihoods<Allele> readAlleleLikelihoodsForAnnotations;
final GenomeLoc loc = genomeLocParser.createGenomeLoc(call);
@ -744,10 +860,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
* @return never {@code null} but perhaps an empty list if there is no variants to report.
*/
protected TreeSet<Integer> decomposeHaplotypesIntoVariantContexts(final List<Haplotype> haplotypes,
final ReadLikelihoods readLikelihoods,
final byte[] ref,
final GenomeLoc refLoc,
final List<VariantContext> activeAllelesToGenotype) {
final ReadLikelihoods readLikelihoods,
final byte[] ref,
final GenomeLoc refLoc,
final List<VariantContext> activeAllelesToGenotype) {
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
@ -782,8 +898,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
}
protected List<VariantContext> getVCsAtThisLocation(final List<Haplotype> haplotypes,
final int loc,
final List<VariantContext> activeAllelesToGenotype) {
final int loc,
final List<VariantContext> activeAllelesToGenotype) {
// the overlapping events to merge into a common reference view
final List<VariantContext> eventsAtThisLoc = new ArrayList<>();

View File

@ -170,6 +170,15 @@ public class GenotypeLikelihoodCalculatorUnitTest {
}
// Spot-checks computeMaxAcceptableAlleleCount for a fixed max genotype count of 1024 across
// increasing ploidies. For ploidy 1 the genotype count equals the allele count (C(A, A-1) = A),
// so all 1024 alleles are acceptable; larger ploidies admit sharply fewer alleles.
// NOTE(review): if this test class uses TestNG, Assert.assertEquals takes (actual, expected);
// these calls pass the expected constant first. The equality check is symmetric so the assertions
// still pass either way, but failure messages would be reversed — confirm the framework and
// consider swapping the arguments.
@Test
public void testComputeMaxAcceptableAlleleCount(){
Assert.assertEquals(1024, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(1, 1024));
Assert.assertEquals(44, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(2, 1024));
Assert.assertEquals(17, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(3, 1024));
Assert.assertEquals(5, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(10, 1024));
Assert.assertEquals(3, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(20, 1024));
Assert.assertEquals(2, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(100, 1024));
}
// Simple inefficient calculation of the genotype count given the ploidy.
private int calculateGenotypeCount(final int ploidy, final int alleleCount) {
if (ploidy == 0)

View File

@ -130,6 +130,7 @@ public class Haplotype extends Allele {
final Haplotype ret = new Haplotype(newBases, isReference());
ret.setCigar(newCigar);
ret.setGenomeLocation(loc);
ret.setScore(score);
ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef());
return ret;
}