From 4f95f850b3d84a5c8e26533a02916bffd6710bdc Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 7 Jan 2013 11:05:44 -0500 Subject: [PATCH] Bug fix in the HC's allele mapping for multi-allelic events. Using the allele alone as a key isn't sufficient because alleles change when the reference allele changes during VariantContextUtils.simpleMerge for multi-allelic events. --- .../haplotypecaller/GenotypingEngine.java | 185 +++++++++++------- .../haplotypecaller/HaplotypeCaller.java | 20 +- .../HaplotypeCallerIntegrationTest.java | 6 +- .../variantcontext/VariantContextUtils.java | 1 - 4 files changed, 124 insertions(+), 88 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 5aef002fe..8b7b0b918 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -59,16 +59,16 @@ public class GenotypingEngine { } @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, - final List haplotypes, - final List samples, - final Map haplotypeReadMap, - final Map> perSampleFilteredReadList, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final List activeAllelesToGenotype ) { + public List assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final List samples, + final Map haplotypeReadMap, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { final List returnCalls = new ArrayList(); final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); @@ -120,7 +120,7 @@ public class GenotypingEngine { priorityList.clear(); int alleleCount = 0; for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - HashSet alleleSet = new HashSet(2); + ArrayList alleleSet = new ArrayList(2); alleleSet.add(compVC.getReference()); alleleSet.add(compAltAllele); priorityList.add("Allele" + alleleCount); @@ -133,45 +133,26 @@ public class GenotypingEngine { if( eventsAtThisLoc.isEmpty() ) { continue; } - // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - Map> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + // Create the event mapping object which maps the original haplotype events to the events present at just this locus + final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); - final Allele refAllele = eventsAtThisLoc.get(0).getReference(); - final ArrayList alleleOrdering = new ArrayList(alleleMapper.size()); - alleleOrdering.add(refAllele); - for( final VariantContext vc : eventsAtThisLoc ) { - alleleOrdering.add(vc.getAlternateAllele(0)); - } - - // Sanity check the priority list - for( final VariantContext vc : eventsAtThisLoc ) { - if( !priorityList.contains(vc.getSource()) ) { - throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles."); - } - } - for( final String name : priorityList ) { - boolean found = false; - for( final VariantContext vc : eventsAtThisLoc ) { - if(vc.getSource().equals(name)) { found = true; break; } - } - if( !found ) { - throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles."); - } - } + // Sanity check the priority list for mistakes + sanityCheckPriorityList( priorityList, eventsAtThisLoc ); // Merge the event to find a common reference representation final VariantContext mergedVC = VariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } - // let's update the Allele keys in the mapper because they can change after merging when there are complex events - final Map> updatedAlleleMapper = new HashMap>(alleleMapper.size()); - for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) { - final Allele oldAllele = alleleOrdering.get(i); - final Allele newAllele = mergedVC.getAlleles().get(i); - updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele)); - alleleOrdering.set(i, newAllele); + if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { + throw new ReviewedStingException("Something went wrong in the merging of alleles."); } - alleleMapper = updatedAlleleMapper; + final HashMap mergeMap = new HashMap(); + mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele + for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) { + mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function + } + + final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); if( DEBUG ) { System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); @@ -180,20 +161,7 @@ public class GenotypingEngine { final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext genotypes = GenotypesContext.create(samples.size()); - for( final String sample : samples ) { - final int numHaplotypes = alleleMapper.size(); - final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, alleleOrdering); - int glIndex = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); - } + final GenotypesContext genotypes = calculateGLsForThisEvent( samples, alleleReadMap, mergedVC ); final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); if( call != null ) { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : @@ -212,6 +180,41 @@ public class GenotypingEngine { return returnCalls; } + private GenotypesContext calculateGLsForThisEvent( final List samples, final Map alleleReadMap, final VariantContext mergedVC ) { + final GenotypesContext genotypes = GenotypesContext.create(samples.size()); + // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample + for( final String sample : samples ) { + final int numHaplotypes = mergedVC.getAlleles().size(); + final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles()); + int glIndex = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC + } + } + genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); + } + return genotypes; + } + + private void sanityCheckPriorityList( final ArrayList priorityList, final ArrayList eventsAtThisLoc ) { + for( final VariantContext vc : eventsAtThisLoc ) { + if( !priorityList.contains(vc.getSource()) ) { + throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles."); + } + } + for( final String name : priorityList ) { + boolean found = false; + for( final VariantContext vc : eventsAtThisLoc ) { + if(vc.getSource().equals(name)) { found = true; break; } + } + if( !found ) { + throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles."); + } + } + } + private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, final Map perSampleReadMap, final Map> perSampleFilteredReadList, @@ -437,27 +440,45 @@ public class GenotypingEngine { return ((pa1b1 - pa1*pb1) * (pa1b1 - pa1*pb1)) / ( pa1 * (1.0 - pa1) * pb1 * (1.0 - pb1) ); } + protected static Map> createAlleleMapper( final Map mergeMap, final Map> eventMap ) { + final Map> alleleMapper = new HashMap>(); + for( final Map.Entry entry : mergeMap.entrySet() ) { + alleleMapper.put(entry.getValue(), eventMap.get(new Event(entry.getKey()))); + } + return alleleMapper; + } + @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static Map> createAlleleMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + protected static Map> createEventMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { - final Map> alleleMapper = new HashMap>(eventsAtThisLoc.size()+1); + final Map> eventMapper = new HashMap>(eventsAtThisLoc.size()+1); final Allele refAllele = eventsAtThisLoc.get(0).getReference(); - alleleMapper.put(refAllele, new ArrayList()); - for( final VariantContext vc : eventsAtThisLoc ) - alleleMapper.put(vc.getAlternateAllele(0), new ArrayList()); + VariantContext refVC = eventsAtThisLoc.get(0); + eventMapper.put(new Event(null), new ArrayList()); + for( final VariantContext vc : eventsAtThisLoc ) { + eventMapper.put(new Event(vc), new ArrayList()); + } final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size()); for( final Haplotype h : haplotypes ) { - if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { - alleleMapper.get(h.getArtificialAllele()).add(h); + if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() ) { + final ArrayList alleles = new ArrayList(2); + alleles.add(refAllele); + alleles.add(h.getArtificialAllele()); + final Event artificialVC = new Event( (new VariantContextBuilder()).source("artificialHaplotype") + .alleles(alleles) + .loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + refAllele.length() - 1).make() ); + if( eventMapper.containsKey(artificialVC) ) { + eventMapper.get(artificialVC).add(h); + } } else if( h.getEventMap().get(loc) == null ) { // no event at this location so let's investigate later undeterminedHaplotypes.add(h); } else { boolean haplotypeIsDetermined = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h); + eventMapper.get(new Event(vcAtThisLoc)).add(h); haplotypeIsDetermined = true; break; } @@ -469,25 +490,23 @@ public class GenotypingEngine { } for( final Haplotype h : undeterminedHaplotypes ) { - Allele matchingAllele = null; - for( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { + Event matchingEvent = new Event(null); + for( final Map.Entry> eventToTest : eventMapper.entrySet() ) { // don't test against the reference allele - if( alleleToTest.getKey().equals(refAllele) ) + if( eventToTest.getKey().equals(new Event(null)) ) continue; - final Haplotype artificialHaplotype = alleleToTest.getValue().get(0); + final Haplotype artificialHaplotype = eventToTest.getValue().get(0); if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) { - matchingAllele = alleleToTest.getKey(); + matchingEvent = eventToTest.getKey(); break; } } - if( matchingAllele == null ) - matchingAllele = refAllele; - alleleMapper.get(matchingAllele).add(h); + eventMapper.get(matchingEvent).add(h); } - return alleleMapper; + return eventMapper; } protected static boolean isSubSetOf(final Map subset, final Map superset, final boolean resolveSupersetToSubset) { @@ -636,4 +655,22 @@ public class GenotypingEngine { } return vcs; } + + private static class Event { + public VariantContext vc; + + public Event( final VariantContext vc ) { + this.vc = vc; + } + + @Override + public boolean equals( final Object obj ) { + return obj instanceof Event && ((((Event) obj).vc == null && vc == null) || (((Event) obj).vc != null && vc != null && ((Event) obj).vc.hasSameAllelesAs(vc))) ; + } + + @Override + public int hashCode() { + return (vc == null ? -1 : vc.getAlleles().hashCode()); + } + } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 8c8113f46..cde089b34 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -416,16 +416,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap ) : haplotypes ); - for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, - bestHaplotypes, - samplesList, - stratifiedReadMap, - perSampleFilteredReadList, - fullReferenceWithPadding, - getPaddedLoc(activeRegion), - activeRegion.getLocation(), - getToolkit().getGenomeLocParser(), - activeAllelesToGenotype ) ) { + for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoods( UG_engine, + bestHaplotypes, + samplesList, + stratifiedReadMap, + perSampleFilteredReadList, + fullReferenceWithPadding, + getPaddedLoc(activeRegion), + activeRegion.getLocation(), + getToolkit().getGenomeLocParser(), + activeAllelesToGenotype ) ) { vcfWriter.add( call ); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 1683044f0..4c2714dde 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,7 +21,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "47fdbe5f01d3ce5e53056eea8c488e45"); + HCTest(CEUTRIO_BAM, "", "35c8425b44429ac7468c2cd26f8f5a42"); } @Test @@ -33,7 +33,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "54b7cc3da3d8349ff4302f99883ab188"); + "d918d25b22a551cae5d70ea30d7feed1"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -72,7 +72,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ece627de486aee69d02872891c6cb0ff")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("2e8e6313228b0387008437feae7f5469")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java index 5d9a6d476..4d1e70340 100755 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java @@ -499,7 +499,6 @@ public class VariantContextUtils { final VariantContext first = VCs.get(0); final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); - Byte referenceBaseForIndel = null; final Set alleles = new LinkedHashSet(); final Set filters = new HashSet();