From 4512940e8770a01b99b71a25f5942f0c3143ae63 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 14 Jul 2014 14:57:56 -0400 Subject: [PATCH] Initial implementation of functionality to add physical phasing information to the output of the HaplotypeCaller. If any pair of variants occurs on all used haplotypes together, then we propagate that information into the gVCF. Can be enabled with the --tryPhysicalPhasing argument. --- .../haplotypecaller/HaplotypeCaller.java | 11 +- .../HaplotypeCallerGenotypingEngine.java | 180 +++++++++++++++- ...plotypeCallerGenotypingEngineUnitTest.java | 200 +++++++++++++++++- .../gatk/utils/haplotype/EventMap.java | 99 +-------- .../utils/haplotype/EventMapUnitTest.java | 4 +- 5 files changed, 393 insertions(+), 101 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index cfb29d272..2547d8bda 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -495,6 +495,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) protected boolean mergeVariantsViaLD = false; + @Advanced + @Argument(fullName="tryPhysicalPhasing", shortName="tryPhysicalPhasing", doc="If specified, we will add physical (read-based) phasing information", required = false) + protected boolean tryPhysicalPhasing = false; + + public static final String HAPLOTYPE_CALLER_PHASING_KEY = "HCP"; + // ----------------------------------------------------------------------------------------------- // arguments for debugging / developing the haplotype caller // ----------------------------------------------------------------------------------------------- @@ -671,7 +677,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && consensusMode ) throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. Please choose one or the other."); - genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(),SCAC); + genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(), SCAC, tryPhysicalPhasing); // initialize the output VCF header final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); @@ -692,6 +698,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_PL_KEY); + if ( tryPhysicalPhasing ) + headerInfo.add(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Physical phasing information, each unique ID within a given sample (but not across samples) connects alternate alleles as occurring on the same haplotype")); + // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality")); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 90dda170e..d4a97a5af 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -78,13 +78,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine sampleNames) { super(toolkit,configuration,sampleNames); + tryPhysicalPhasing = false; } @@ -279,7 +283,179 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine phasedCalls = tryPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls; + return new CalledHaplotypes(phasedCalls, calledHaplotypes); + } + + /** + * Tries to phase the individual alleles based on pairwise comparisons to the other alleles based on all called haplotypes + * + * @param calls the list of called alleles + * @param calledHaplotypes the set of haplotypes used for calling + * @return a non-null list which represents the possibly phased version of the calls + */ + protected List phaseCalls(final List calls, final Set calledHaplotypes) { + + // construct a mapping from alternate allele to the set of haplotypes that contain that allele + final Map> haplotypeMap = constructHaplotypeMapping(calls, calledHaplotypes); + + // construct a mapping from call to phase set ID + final Map phaseSetMapping = new HashMap<>(); + final int uniqueCounterEndValue = constructPhaseSetMapping(calls, haplotypeMap, phaseSetMapping); + + // we want to establish (potential) *groups* of phased variants, so we need to be smart when looking at pairwise phasing partners + return constructPhaseGroups(calls, phaseSetMapping, uniqueCounterEndValue); + } + + /** + * Construct the mapping from alternate allele to the set of haplotypes that contain that allele + * + * @param originalCalls the original unphased calls + * @param calledHaplotypes the set of haplotypes used for calling + * @return non-null Map + */ + protected static Map> constructHaplotypeMapping(final List originalCalls, + final Set calledHaplotypes) { + final Map> haplotypeMap = new HashMap<>(originalCalls.size()); + for ( final VariantContext call : originalCalls ) { + // don't try to phase if there is not exactly 1 alternate allele + if ( ! isBiallelic(call) ) { + haplotypeMap.put(call, Collections.emptySet()); + continue; + } + + // keep track of the haplotypes that contain this particular alternate allele + final Set hapsWithAllele = new HashSet<>(); + final Allele alt = call.getAlternateAllele(0); + + for ( final Haplotype h : calledHaplotypes ) { + for ( final VariantContext event : h.getEventMap().getVariantContexts() ) { + if ( event.getStart() == call.getStart() && event.getAlternateAlleles().contains(alt) ) + hapsWithAllele.add(h); + } + } + haplotypeMap.put(call, hapsWithAllele); + } + + return haplotypeMap; + } + + + /** + * Construct the mapping from call (variant context) to phase set ID + * + * @param originalCalls the original unphased calls + * @param haplotypeMap mapping from alternate allele to the set of haplotypes that contain that allele + * @param phaseSetMapping the map to populate in this method + * @return the next incremental unique index + */ + protected static int constructPhaseSetMapping(final List originalCalls, + final Map> haplotypeMap, + final Map phaseSetMapping) { + + final int numCalls = originalCalls.size(); + int uniqueCounter = 0; + + // use the haplotype mapping to connect variants that are always present on the same haplotypes + for ( int i = 0; i < numCalls - 1; i++ ) { + final VariantContext call = originalCalls.get(i); + final Set haplotypesWithCall = haplotypeMap.get(call); + if ( haplotypesWithCall.isEmpty() ) + continue; + + for ( int j = i+1; j < numCalls; j++ ) { + final VariantContext comp = originalCalls.get(j); + final Set haplotypesWithComp = haplotypeMap.get(comp); + + // if the variants are in phase, record that fact + if ( haplotypesWithCall.size() == haplotypesWithComp.size() && haplotypesWithCall.containsAll(haplotypesWithComp) ) { + // if it's part of an existing group, use that group's unique ID + if ( phaseSetMapping.containsKey(call) ) { + phaseSetMapping.put(comp, phaseSetMapping.get(call)); + // otherwise, create a new group + } else { + phaseSetMapping.put(call, uniqueCounter); + phaseSetMapping.put(comp, uniqueCounter); + uniqueCounter++; + } + } + } + } + + return uniqueCounter; + } + + /** + * Assemble the phase groups together and update the original calls accordingly + * + * @param originalCalls the original unphased calls + * @param phaseSetMapping mapping from call (variant context) to phase group ID + * @param indexTo last index (exclusive) of phase group IDs + * @return a non-null list which represents the possibly phased version of the calls + */ + protected static List constructPhaseGroups(final List originalCalls, + final Map phaseSetMapping, + final int indexTo) { + final List phasedCalls = new ArrayList<>(originalCalls); + + // if we managed to find any phased groups, update the VariantContexts + for ( int count = 0; count < indexTo; count++ ) { + // get all of the (indexes of the) calls that belong in this group (keeping them in the original order) + final List indexes = new ArrayList<>(); + for ( int index = 0; index < originalCalls.size(); index++ ) { + final VariantContext call = originalCalls.get(index); + if ( phaseSetMapping.containsKey(call) && phaseSetMapping.get(call) == count ) + indexes.add(index); + } + if ( indexes.size() < 2 ) + throw new IllegalStateException("Somehow we have a group of phased variants that has fewer than 2 members"); + + // create a unique ID based on the leftmost one + final String uniqueID = createUniqueID(originalCalls.get(indexes.get(0))); + + // update the VCs + for ( final int index : indexes ) { + final VariantContext phasedCall = phaseVC(originalCalls.get(index), uniqueID); + phasedCalls.set(index, phasedCall); + } + } + + return phasedCalls; + } + + /** + * Is this variant bi-allelic? This implementation is very much specific to this class so shouldn't be pulled out into a generalized place. + * + * @param vc the variant context + * @return true if this variant context is bi-allelic, ignoring the NON-REF symbolic allele, false otherwise + */ + private static boolean isBiallelic(final VariantContext vc) { + return vc.isBiallelic() || (vc.getNAlleles() == 3 && vc.getAlternateAlleles().contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)); + } + + /** + * Create a unique identifier given the variant context + * + * @param vc the variant context + * @return non-null String + */ + private static String createUniqueID(final VariantContext vc) { + return String.format("%d_%s_%s", vc.getStart(), vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString()); + // return base + "_0," + base + "_1"; + } + + /** + * Add physical phase information to the provided variant context + * + * @param vc the variant context + * @param ID the ID to use + * @return phased non-null variant context + */ + private static VariantContext phaseVC(final VariantContext vc, final String ID) { + final List phasedGenotypes = new ArrayList<>(); + for ( final Genotype g : vc.getGenotypes() ) + phasedGenotypes.add(new GenotypeBuilder(g).attribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY, ID).make()); + return new VariantContextBuilder(vc).genotypes(phasedGenotypes).make(); } /** diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java index 2fd7a64f7..19f07eccd 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java @@ -57,6 +57,7 @@ import htsjdk.variant.variantcontext.*; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.haplotype.EventMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; @@ -365,4 +366,201 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { } return true; } -} \ No newline at end of file + + + @DataProvider(name = "CreateHaplotypeMappingProvider") + public Object[][] makeCreateHaplotypeMappingData() { + List tests = new ArrayList(); + + final Set haplotypes = new HashSet<>(); + final Allele ref = Allele.create("A", true); + final Allele altC = Allele.create("C", false); + final Allele altT = Allele.create("T", false); + + final Haplotype AtoC1 = new Haplotype("AACAA".getBytes()); + final VariantContext vc1 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).make(); + AtoC1.setEventMap(new EventMap(Arrays.asList(vc1))); + AtoC1.getEventMap().put(3, vc1); + haplotypes.add(AtoC1); + + final Haplotype AtoC2 = new Haplotype("AAACA".getBytes()); + final VariantContext vc2 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altT)).make(); + AtoC2.setEventMap(new EventMap(Arrays.asList(vc2))); + AtoC2.getEventMap().put(4, vc2); + haplotypes.add(AtoC2); + + tests.add(new Object[]{vc1, haplotypes, AtoC1}); + tests.add(new Object[]{vc2, haplotypes, AtoC2}); + tests.add(new Object[]{new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altT)).make(), haplotypes, null}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider="CreateHaplotypeMappingProvider") + public void testCreateHaplotypeMapping(final VariantContext vc, final Set haplotypes, final Haplotype expected) { + final Map> mapping = HaplotypeCallerGenotypingEngine.constructHaplotypeMapping(Arrays.asList(vc), haplotypes); + final Set actual = mapping.get(vc); + if ( expected == null ) + Assert.assertTrue(actual.isEmpty(), actual.toString()); + else { + Assert.assertEquals(actual.size(), 1); + Assert.assertEquals(actual.iterator().next(), expected); + } + } + + @DataProvider(name = "ConstructPhaseSetMappingProvider") + public Object[][] makeConstructPhaseSetMappingData() { + List tests = new ArrayList(); + + final Allele ref = Allele.create("A", true); + final Allele altC = Allele.create("C", false); + final Allele altT = Allele.create("T", false); + + final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).make(); + final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altT)).make(); + final VariantContext vc4 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altC)).make(); + final List calls = Arrays.asList(vc2, vc3, vc4); + + final Haplotype pos2 = new Haplotype("ACAAA".getBytes()); + pos2.setEventMap(new EventMap(Arrays.asList(vc2))); + pos2.getEventMap().put(2, vc2); + final Haplotype pos3 = new Haplotype("AACAA".getBytes()); + pos3.setEventMap(new EventMap(Arrays.asList(vc3))); + pos3.getEventMap().put(3, vc3); + final Haplotype pos4 = new Haplotype("AAACA".getBytes()); + pos4.setEventMap(new EventMap(Arrays.asList(vc4))); + pos4.getEventMap().put(4, vc4); + final Haplotype pos24 = new Haplotype("ACACA".getBytes()); + pos24.setEventMap(new EventMap(Arrays.asList(vc2, vc4))); + pos24.getEventMap().put(2, vc2); + pos24.getEventMap().put(4, vc4); + final Haplotype pos234 = new Haplotype("ACCCA".getBytes()); + pos234.setEventMap(new EventMap(Arrays.asList(vc2, vc3, vc4))); + pos234.getEventMap().put(2, vc2); + pos234.getEventMap().put(3, vc3); + pos234.getEventMap().put(4, vc4); + + // test no phased variants + final Set haplotypes2NoPhase = new HashSet<>(); + haplotypes2NoPhase.add(pos2); + haplotypes2NoPhase.add(pos24); + final Set haplotypes3NoPhase = new HashSet<>(); + haplotypes3NoPhase.add(pos3); + final Set haplotypes4NoPhase = new HashSet<>(); + haplotypes4NoPhase.add(pos4); + haplotypes4NoPhase.add(pos24); + final Map> haplotypeMapNoPhase = new HashMap<>(); + haplotypeMapNoPhase.put(vc2, haplotypes2NoPhase); + haplotypeMapNoPhase.put(vc3, haplotypes3NoPhase); + haplotypeMapNoPhase.put(vc4, haplotypes4NoPhase); + tests.add(new Object[]{calls, haplotypeMapNoPhase, 0, 0}); + + // test 2 phased variants + final Set haplotypes2SomePhase = new HashSet<>(); + haplotypes2SomePhase.add(pos24); + final Set haplotypes4SomePhase = new HashSet<>(); + haplotypes4SomePhase.add(pos24); + final Map> haplotypeMapSomePhase = new HashMap<>(); + haplotypeMapSomePhase.put(vc2, haplotypes2SomePhase); + haplotypeMapSomePhase.put(vc3, haplotypes3NoPhase); + haplotypeMapSomePhase.put(vc4, haplotypes4SomePhase); + tests.add(new Object[]{calls, haplotypeMapSomePhase, 2, 1}); + + // test all phased variants + final Set haplotypes2AllPhase = new HashSet<>(); + haplotypes2AllPhase.add(pos234); + final Set haplotypes3AllPhase = new HashSet<>(); + haplotypes3AllPhase.add(pos234); + final Set haplotypes4AllPhase = new HashSet<>(); + haplotypes4AllPhase.add(pos234); + final Map> haplotypeMapAllPhase = new HashMap<>(); + haplotypeMapAllPhase.put(vc2, haplotypes2AllPhase); + haplotypeMapAllPhase.put(vc3, haplotypes3AllPhase); + haplotypeMapAllPhase.put(vc4, haplotypes4AllPhase); + tests.add(new Object[]{calls, haplotypeMapAllPhase, 3, 1}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider="ConstructPhaseSetMappingProvider") + public void testConstructPhaseSetMapping(final List calls, + final Map> haplotypeMap, + final int expectedMapSize, + final int expectedNumGroups) { + final Map actualPhaseSetMapping = new HashMap<>(); + final int actualNumGroups = HaplotypeCallerGenotypingEngine.constructPhaseSetMapping(calls, haplotypeMap, actualPhaseSetMapping); + Assert.assertEquals(actualNumGroups, expectedNumGroups); + Assert.assertEquals(actualPhaseSetMapping.size(), expectedMapSize); + } + + @DataProvider(name = "ConstructPhaseGroupsProvider") + public Object[][] makeConstructPhaseGroupsData() { + List tests = new ArrayList(); + + final Allele ref = Allele.create("A", true); + final Allele altC = Allele.create("C", false); + + final Genotype g1 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make(); + final VariantContext vc1 = new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altC)).genotypes(g1).make(); + final Genotype g2 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make(); + final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).genotypes(g2).make(); + final Genotype g3 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make(); + final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).genotypes(g3).make(); + final List calls = Arrays.asList(vc1, vc2, vc3); + + // test no phased variants, empty map + final Map nonePhased1 = new HashMap<>(); + tests.add(new Object[]{calls, nonePhased1, 0, 0, 0}); + + // test no phased variants, full map, exception expected + final Map nonePhased2 = new HashMap<>(); + nonePhased2.put(vc1, 0); + nonePhased2.put(vc2, 1); + nonePhased2.put(vc3, 2); + tests.add(new Object[]{calls, nonePhased2, 3, -1, -1}); + + // test 2 phased variants + final Map twoPhased = new HashMap<>(); + twoPhased.put(vc1, 0); + twoPhased.put(vc2, 0); + tests.add(new Object[]{calls, twoPhased, 1, 1, 2}); + + // test all phased variants + final Map allPhased = new HashMap<>(); + allPhased.put(vc1, 0); + allPhased.put(vc2, 0); + allPhased.put(vc3, 0); + tests.add(new Object[]{calls, allPhased, 1, 1, 3}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider="ConstructPhaseGroupsProvider") + public void testConstructPhaseGroups(final List calls, + final Map phaseMap, + final int endIndex, + final int expectedNumGroups, + final int expectedGroupSize) { + final List actualPhasedCalls; + try { + actualPhasedCalls = HaplotypeCallerGenotypingEngine.constructPhaseGroups(calls, phaseMap, endIndex); + } catch (IllegalStateException e) { + Assert.assertEquals(-1, expectedNumGroups); + return; + } + + final Set uniqueGroups = new HashSet<>(); + int counter = 0; + for ( final VariantContext call : actualPhasedCalls ) { + for ( final Genotype g : call.getGenotypes() ) { + if ( g.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY) ) { + uniqueGroups.add(g.getExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY).toString()); + counter++; + } + } + } + + Assert.assertEquals(uniqueGroups.size(), expectedNumGroups); + Assert.assertEquals(counter, expectedGroupSize); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java index a540054bc..c460ff555 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java @@ -74,7 +74,7 @@ public class EventMap extends TreeMap { * For testing. Let's you set up a explicit configuration without having to process a haplotype and reference * @param stateForTesting */ - protected EventMap(final Collection stateForTesting) { + public EventMap(final Collection stateForTesting) { haplotype = null; ref = null; refLoc = null; @@ -176,99 +176,8 @@ public class EventMap extends TreeMap { } } - // handle the case where the event set for the haplotype is very complex - // TODO -- theoretically this should be part of the MergeVariantsAcrossHaplotypes class, but it's just so much more complicated to do so - if ( variationIsTooComplex(proposedEvents) ) { - addComplexVC(cigar, alignment, haplotype.getAlignmentStartHapwrtRef()); - } else { - for ( final VariantContext proposedEvent : proposedEvents ) - addVC(proposedEvent, true); - } - } - - /** - * Determine whether the provided set of variants is too complex for breaking up into individual parts - * - * @param events the individual events - * @return true if the cigar is too complex, false otherwise - */ - private boolean variationIsTooComplex(final List events) { - // TODO -- we've decided to disable this for now and try "physical phasing" - return false; - - //int indelCount = 0; - //for ( final VariantContext event : events ) { - // if ( event.isIndel() ) - // indelCount++; - //} - // - // don't allow too many indels - //return indelCount > MAX_INDELS_PER_HAPLOTYPE; - } - - /** - * Add a complex variant context to the events given the haplotype and its cigar - * - * @param cigar the cigar to convert - * @param haplotype the bases of the alternate haplotype - * @param refPos the position on the reference for this alignment - */ - private void addComplexVC(final Cigar cigar, final byte[] haplotype, final int refPos) { - // ignore leading and trailing bases that match between this haplotype and the reference - final int matchingPrefix = numPrefixMatch(ref, haplotype, refPos, 0); - final int matchingSuffix = numSuffixMatch(ref, haplotype, refPos + cigar.getReferenceLength() - 1, haplotype.length - 1); - - // edge case: too many matches - final int totalMatch = matchingPrefix + matchingSuffix; - if ( totalMatch >= haplotype.length || totalMatch >= ref.length ) - return; - - final byte[] refBases = Arrays.copyOfRange(ref, refPos + matchingPrefix, refPos + cigar.getReferenceLength() - matchingSuffix); - final byte[] altBases = Arrays.copyOfRange(haplotype, matchingPrefix, haplotype.length - matchingSuffix); - - final List alleles = new ArrayList<>(); - alleles.add( Allele.create( refBases, true ) ); - alleles.add( Allele.create( altBases, false ) ); - final int start = refLoc.getStart() + refPos + matchingPrefix; - addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), start, start + refBases.length - 1, alleles).make(), true); - } - - /** - * calculates the extent of the prefix match between 2 sequences - * - * @param seq1 the first sequence - * @param seq2 the second sequence - * @param startPos1 the index on seq1 to start from - * @param startPos2 the index on seq2 to start from - * @return non-negative int representing the matching prefix - */ - private int numPrefixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) { - int matchingBases = 0; - for ( int pos1 = startPos1, pos2 = startPos2; pos1 < seq1.length && pos2 < seq2.length; pos1++, pos2++ ) { - if ( seq1[pos1] != seq2[pos2] ) - break; - matchingBases++; - } - return matchingBases; - } - - /** - * calculates the extent of the suffix match between 2 sequences - * - * @param seq1 the first sequence - * @param seq2 the second sequence - * @param startPos1 the index on seq1 to start from - * @param startPos2 the index on seq2 to start from - * @return non-negative int representing the matching suffix - */ - private int numSuffixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) { - int matchingBases = 0; - for ( int pos1 = startPos1, pos2 = startPos2; pos1 >=0 && pos2 >= 0; pos1--, pos2-- ) { - if ( seq1[pos1] != seq2[pos2] ) - break; - matchingBases++; - } - return matchingBases; + for ( final VariantContext proposedEvent : proposedEvents ) + addVC(proposedEvent, true); } /** @@ -345,7 +254,7 @@ public class EventMap extends TreeMap { // TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it @Requires("getNumberOfEvents() > 0") - protected void replaceClumpedEventsWithBlockSubstititions() { + protected void replaceClumpedEventsWithBlockSubstitutions() { if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) { int lastStart = -1; for ( boolean foundOne = true; foundOne; ) { diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java index 3870182fe..6baf6ef4d 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java @@ -130,7 +130,7 @@ public class EventMapUnitTest extends BaseTest { final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME); - ee.replaceClumpedEventsWithBlockSubstititions(); + ee.replaceClumpedEventsWithBlockSubstitutions(); Assert.assertEquals(ee.getNumberOfEvents(), 1); final VariantContext actual = ee.getVariantContexts().iterator().next(); Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual); @@ -159,7 +159,7 @@ public class EventMapUnitTest extends BaseTest { final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME); - ee.replaceClumpedEventsWithBlockSubstititions(); + ee.replaceClumpedEventsWithBlockSubstitutions(); Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size()); final List actuals = new ArrayList(ee.getVariantContexts()); for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) {