From cc91052e6961e4592aa8d3a977e246957ae43f2a Mon Sep 17 00:00:00 2001 From: Steve Huang Date: Thu, 27 Oct 2016 11:19:49 -0400 Subject: [PATCH] fixed a max priority Q error while removing alt alleles when faced with high ploidy and allele count; added hackish integration test (#1457) --- .../HaplotypeCallerGenotypingEngine.java | 164 +++++++++++------- ...plotypeCallerGenotypingEngineUnitTest.java | 102 ++++++++++- .../HaplotypeCallerIntegrationTest.java | 10 ++ 3 files changed, 208 insertions(+), 68 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 1f31e7cd6..2a57c09ac 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -54,10 +54,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.common.annotations.VisibleForTesting; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import htsjdk.samtools.util.StringUtil; import htsjdk.variant.variantcontext.*; -import org.apache.commons.lang.ArrayUtils; -import org.apache.commons.math3.stat.StatUtils; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; @@ -87,8 +84,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine practicaAlleleCountForPloidy = new HashMap<>(); + private final int maxGenotypeCountToEnumerate; + private final Map practicalAlleleCountForPloidy = new HashMap<>(); private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; @@ -105,13 +102,18 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine readAlleleLikelihoods = readLikelihoods.marginalize(practicalAlleleMapper, + final ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); if (configuration.isSampleContaminationPresent()) @@ -358,52 +340,82 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(final Map> alleleMapper, final int desiredNumOfAlleles) { + private VariantContext removeAltAllelesIfTooManyGenotypes(final int ploidy, final Map> alleleMapper, final VariantContext mergedVC) { - final PriorityQueue altAlleleMaxPriorityQ = new PriorityQueue<>((sa1, sa2) -> - sa2.compareTo(sa1)); // -1 to turn it into max priority q + final int originalAlleleCount = alleleMapper.size(); + practicalAlleleCountForPloidy.putIfAbsent(ploidy, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(ploidy, maxGenotypeCountToEnumerate)); + final int practicalAlleleCount = practicalAlleleCountForPloidy.get(ploidy); - final Set allelesToRetain = new HashSet<>(); - // populate allelePriorityQ with the relevant information + if (originalAlleleCount > practicalAlleleCount) { + final List allelesToKeep = whichAllelesToKeepBasedonHapScores(alleleMapper, practicalAlleleCount); + alleleMapper.keySet().retainAll(allelesToKeep); + logger.warn(String.format("Removed alt alleles where ploidy is %d and original allele count is %d, whereas after trimming the allele count becomes %d. Alleles kept are:%s", + ploidy, originalAlleleCount, practicalAlleleCount, allelesToKeep)); + return removeExcessAltAllelesFromVC(mergedVC, allelesToKeep); + } else { + return mergedVC; + } + } + + /** + * Returns a list of alleles that is a subset of the key set of input map {@code alleleMapper}. + * The size of the returned list is min({@code desiredNumOfAlleles}, alleleMapper.size()). + * + * Alleles kept are guaranteed to have higher precedence than those removed, where precedence is determined by + * {@link AlleleScoredByHaplotypeScores}. + * + * Entries in the returned list are guaranteed to have the same relative order as they were in the input map. + * + * @param alleleMapper original allele to haplotype map + * @param desiredNumOfAlleles desired allele count, including ref allele + */ + @VisibleForTesting + static List whichAllelesToKeepBasedonHapScores(final Map> alleleMapper, + final int desiredNumOfAlleles) { + + if(alleleMapper.size() <= desiredNumOfAlleles){ + return alleleMapper.keySet().stream().collect(Collectors.toList()); + } + + final PriorityQueue alleleMaxPriorityQ = new PriorityQueue<>(); for(final Allele allele : alleleMapper.keySet()){ - - if(allele.isReference()){ // collect scores information only on alt alleles; ref allele is never trimmed by this function - allelesToRetain.add(allele); - continue; - } - - final List hapScores = alleleMapper.get(allele).stream().map(hap -> hap.getScore()).collect(Collectors.toList()); - Collections.sort(hapScores); + final List hapScores = alleleMapper.get(allele).stream().map(Haplotype::getScore).sorted().collect(Collectors.toList()); final Double highestScore = hapScores.get(hapScores.size()-1); final Double secondHighestScore = hapScores.size()>1 ? hapScores.get(hapScores.size()-2) : Double.NEGATIVE_INFINITY; - altAlleleMaxPriorityQ.add(new AlleleScoredByHaplotypeScores(allele, highestScore, secondHighestScore)); + alleleMaxPriorityQ.add(new AlleleScoredByHaplotypeScores(allele, highestScore, secondHighestScore)); } + final Set allelesToRetain = new LinkedHashSet<>(); while(allelesToRetain.size() allelesToRetain.contains(p.getKey())) - .collect(Collectors.toMap(p->p.getKey(), p->p.getValue())); + return alleleMapper.keySet().stream().filter(allelesToRetain::contains).collect(Collectors.toList()); } /** * A utility class that provides ordering information, given best and second best haplotype scores. * If there's a tie between the two alleles when comparing their best haplotype score, the second best haplotype score * is used for breaking the tie. In the case that one allele doesn't have a second best allele, i.e. it has only one - * supportive haplotype, its second best score is set as null, and is always considered "worse" than another allele - * that has the same best haplotype score, but also has a second best haplotype score. - * TODO: in the extremely unlikely case that two alleles, having the same best haplotype score, neither have a second - * best haplotype score, the case is undecided. + * supportive haplotype, its second best score is set as {@link Double#NEGATIVE_INFINITY}. + * In the extremely unlikely cases that two alleles, having the same best haplotype score, neither have a second + * best haplotype score, or the same second best haplotype score, the order is exactly the same as determined by + * {@link Allele#compareTo(Allele)}. */ - private static final class AlleleScoredByHaplotypeScores { + private static final class AlleleScoredByHaplotypeScores implements Comparable{ private final Allele allele; private final Double bestHaplotypeScore; private final Double secondBestHaplotypeScore; @@ -414,13 +426,21 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine other.bestHaplotypeScore) { - return 1; - } else if (bestHaplotypeScore < other.bestHaplotypeScore) { + + if(allele.isReference() && other.allele.isNonReference()){ return -1; + } else if(allele.isNonReference() && other.allele.isReference()){ + return 1; + } else if(bestHaplotypeScore > other.bestHaplotypeScore) { + return -1; + } else if (bestHaplotypeScore < other.bestHaplotypeScore) { + return 1; + } else if (!secondBestHaplotypeScore.equals(other.secondBestHaplotypeScore)) { + return secondBestHaplotypeScore > other.secondBestHaplotypeScore ? -1 : 1; } else { - return secondBestHaplotypeScore > other.secondBestHaplotypeScore ? 1 : -1; + return allele.compareTo(other.allele); } } @@ -429,6 +449,28 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine allelesToKeep){ + Utils.validateArg(allelesToKeep!=null, "alleles to keep is null"); + Utils.validateArg(!allelesToKeep.contains(null), "alleles to keep contains null elements"); + Utils.validateArg(allelesToKeep.stream().anyMatch(Allele::isReference), "alleles to keep doesn't contain reference allele!"); + Utils.validateArg(inputVC.getAlleles().containsAll(allelesToKeep), "alleles to keep is not a subset of input VC alleles"); + if(inputVC.getAlleles().size() == allelesToKeep.size()) return inputVC; + + final VariantContextBuilder vcb = new VariantContextBuilder(inputVC); + final List originalList = inputVC.getAlleles(); + originalList.retainAll(allelesToKeep); + vcb.alleles(originalList); + return vcb.make(); + } + /** * Reduce the number alternative alleles in a read-likelihoods collection to the maximum-alt-allele user parameter value. *

diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java index 25ed5e59e..ac4476037 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java @@ -444,12 +444,12 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { @Test(dataProvider="ConstructPhaseSetMappingProvider") public void testConstructPhaseSetMapping(final List calls, - final Map> haplotypeMap, - final int totalHaplotypes, - final int expectedMapSize, - final int expectedNumGroups, - final int expectedNum01, - final int expectedNum10) { + final Map> haplotypeMap, + final int totalHaplotypes, + final int expectedMapSize, + final int expectedNumGroups, + final int expectedNum01, + final int expectedNum10) { final Map> actualPhaseSetMapping = new HashMap<>(); final int actualNumGroups = HaplotypeCallerGenotypingEngine.constructPhaseSetMapping(calls, haplotypeMap, totalHaplotypes, actualPhaseSetMapping); Assert.assertEquals(actualNumGroups, expectedNumGroups); @@ -558,11 +558,99 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { final PloidyModel ploidyModel = new HomogeneousPloidyModel(indexedSampleList, 2); final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel(); - final GenotypingLikelihoods genotypeLikelihoods = genotypingModel.calculateLikelihoods(readLikelihoods, new GenotypingData<>(ploidyModel,readLikelihoods)); + final GenotypingLikelihoods genotypeLikelihoods = genotypingModel.calculateLikelihoods(readLikelihoods, new GenotypingData<>(ploidyModel, readLikelihoods)); // test final Set excessAltAlleles = HaplotypeCallerGenotypingEngine.excessAlternativeAlleles(genotypeLikelihoods, 2); Assert.assertFalse(excessAltAlleles.contains(ref)); Assert.assertEquals(excessAltAlleles.size(), 1); } + + @Test + public void testReduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(){ + + // first have a list of alleles, one ref, several alt + final Allele ref = Allele.create("A", true); + final Allele altC = Allele.create("C", false); + final Allele altT = Allele.create("T", false); + final Allele altT2 = Allele.create("TT", false); + final Allele altG = Allele.create("G", false); + + // then create several haplotypes, assign ad-hoc scores + final Haplotype hapRef = new Haplotype("AAAAA".getBytes()); + hapRef.setScore(Double.MAX_VALUE); + + // test case when both same best score and second best score are the same + final Haplotype hapT = new Haplotype("TAAAA".getBytes()); + hapT.setScore(-2.0); + final Haplotype hapTAnother = new Haplotype("TAAAT".getBytes()); + hapTAnother.setScore(-3.0); + final Haplotype hapT2 = new Haplotype("TTAAA".getBytes()); + hapT2.setScore(-2.0); + final Haplotype hapT2Another = new Haplotype("TTAAT".getBytes()); + hapT2Another.setScore(-3.0); + + final Haplotype hapC = new Haplotype("CAAAA".getBytes()); + hapC.setScore(-3.0); + + // for case when there's tie in highest haplotype score + final Haplotype hapG = new Haplotype("GAAAA".getBytes()); + hapG.setScore(-3.0); + final Haplotype hapGAnother = new Haplotype("GAAAG".getBytes()); + hapGAnother.setScore(-5.0); + + final Map> alleleMapper = new LinkedHashMap<>(); + alleleMapper.put(ref, Arrays.asList(hapRef)); + alleleMapper.put(altC, Arrays.asList(hapC)); + alleleMapper.put(altT, Arrays.asList(hapT, hapTAnother)); + alleleMapper.put(altT2, Arrays.asList(hapT2, hapT2Another)); + alleleMapper.put(altG, Arrays.asList(hapG, hapGAnother)); + + List allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 5); + Assert.assertEquals(allelesToKeep.size(), 5); + + Iterator it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + Assert.assertEquals(it.next(), altC); + Assert.assertEquals(it.next(), altT); + Assert.assertEquals(it.next(), altT2); + Assert.assertEquals(it.next(), altG); + + allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 4); + Assert.assertEquals(allelesToKeep.size(), 4); + it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + Assert.assertEquals(it.next(), altT); + Assert.assertEquals(it.next(), altT2); + Assert.assertEquals(it.next(), altG); + + allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 3); + Assert.assertEquals(allelesToKeep.size(), 3); + it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + Assert.assertEquals(it.next(), altT); + Assert.assertEquals(it.next(), altT2); + + allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 2); + Assert.assertEquals(allelesToKeep.size(), 2); + it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + Assert.assertEquals(it.next(), altT); + + allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 1); + Assert.assertEquals(allelesToKeep.size(), 1); + it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + } + + + @Test + public void testRemoveExcessiveAltAlleleFromVC(){ + final VariantContext originalVC = new VariantContextBuilder("source", "1", 1000000, 1000000, Arrays.asList(Allele.create("A", true), Allele.create("T", false), Allele.create("C", false), Allele.create("G", false))).make(); + + final VariantContext reducedVC = HaplotypeCallerGenotypingEngine.removeExcessAltAllelesFromVC(originalVC, Arrays.asList(Allele.create("A", true), Allele.create("T", false), Allele.create("C", false))); + + Assert.assertEquals(reducedVC.getNAlleles(), 3); + Assert.assertTrue(reducedVC.getAlleles().containsAll(Arrays.asList(Allele.create("A", true), Allele.create("T", false), Allele.create("C", false)))); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 5303ecdf7..6e28abe01 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -512,5 +512,15 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5Variants, md5BAMOut)); executeTest("testHaplotypeCallerReadPosRankSum", spec); } + + @Test + public void testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores() { + final File testBAM = new File(privateTestDir + "pretendTobeTetraPloidTetraAllelicSite.bam"); + final String md5 = "289304f56833ea76b60cd08763b0f68b"; + final String base = String.format("-T HaplotypeCaller -R %s -I %s -L 20:11363580-11363600 -ploidy 4 -maxGT 15 ", REF, testBAM) + + " --no_cmdline_in_header -o %s"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5)); + executeTest("testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores", spec); + } }