diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 1e656facc..cf34b1fb4 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -496,6 +496,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) protected boolean mergeVariantsViaLD = false; + @Advanced + @Argument(fullName="tryPhysicalPhasing", shortName="tryPhysicalPhasing", doc="If specified, we will add physical (read-based) phasing information", required = false) + protected boolean tryPhysicalPhasing = false; + + public static final String HAPLOTYPE_CALLER_PHASING_KEY = "HCP"; + // ----------------------------------------------------------------------------------------------- // arguments for debugging / developing the haplotype caller // ----------------------------------------------------------------------------------------------- @@ -672,7 +678,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && consensusMode ) throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. Please choose one or the other."); - genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(),SCAC); + genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(), SCAC, tryPhysicalPhasing); // initialize the output VCF header final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); @@ -693,6 +699,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_PL_KEY); + if ( tryPhysicalPhasing ) + headerInfo.add(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Physical phasing information, each unique ID within a given sample (but not across samples) connects alternate alleles as occurring on the same haplotype")); + // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality")); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index a1bf09043..0a78367d3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -77,13 +77,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine sampleNames) { super(toolkit,configuration,sampleNames); + tryPhysicalPhasing = false; } @@ -282,7 +286,179 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine phasedCalls = tryPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls; + return new CalledHaplotypes(phasedCalls, calledHaplotypes); + } + + /** + * Tries to phase the individual alleles based on pairwise comparisons to the other alleles based on all called haplotypes + * + * @param calls the list of called alleles + * @param calledHaplotypes the set of haplotypes used for calling + * @return a non-null list which represents the possibly phased version of the calls + */ + protected List phaseCalls(final List calls, final Set calledHaplotypes) { + + // construct a mapping from alternate allele to the set of haplotypes that contain that allele + final Map> haplotypeMap = constructHaplotypeMapping(calls, calledHaplotypes); + + // construct a mapping from call to phase set ID + final Map phaseSetMapping = new HashMap<>(); + final int uniqueCounterEndValue = constructPhaseSetMapping(calls, haplotypeMap, phaseSetMapping); + + // we want to establish (potential) *groups* of phased variants, so we need to be smart when looking at pairwise phasing partners + return constructPhaseGroups(calls, phaseSetMapping, uniqueCounterEndValue); + } + + /** + * Construct the mapping from alternate allele to the set of haplotypes that contain that allele + * + * @param originalCalls the original unphased calls + * @param calledHaplotypes the set of haplotypes used for calling + * @return non-null Map + */ + protected static Map> constructHaplotypeMapping(final List originalCalls, + final Set calledHaplotypes) { + final Map> haplotypeMap = new HashMap<>(originalCalls.size()); + for ( final VariantContext call : originalCalls ) { + // don't try to phase if there is not exactly 1 alternate allele + if ( ! isBiallelic(call) ) { + haplotypeMap.put(call, Collections.emptySet()); + continue; + } + + // keep track of the haplotypes that contain this particular alternate allele + final Set hapsWithAllele = new HashSet<>(); + final Allele alt = call.getAlternateAllele(0); + + for ( final Haplotype h : calledHaplotypes ) { + for ( final VariantContext event : h.getEventMap().getVariantContexts() ) { + if ( event.getStart() == call.getStart() && event.getAlternateAlleles().contains(alt) ) + hapsWithAllele.add(h); + } + } + haplotypeMap.put(call, hapsWithAllele); + } + + return haplotypeMap; + } + + + /** + * Construct the mapping from call (variant context) to phase set ID + * + * @param originalCalls the original unphased calls + * @param haplotypeMap mapping from alternate allele to the set of haplotypes that contain that allele + * @param phaseSetMapping the map to populate in this method + * @return the next incremental unique index + */ + protected static int constructPhaseSetMapping(final List originalCalls, + final Map> haplotypeMap, + final Map phaseSetMapping) { + + final int numCalls = originalCalls.size(); + int uniqueCounter = 0; + + // use the haplotype mapping to connect variants that are always present on the same haplotypes + for ( int i = 0; i < numCalls - 1; i++ ) { + final VariantContext call = originalCalls.get(i); + final Set haplotypesWithCall = haplotypeMap.get(call); + if ( haplotypesWithCall.isEmpty() ) + continue; + + for ( int j = i+1; j < numCalls; j++ ) { + final VariantContext comp = originalCalls.get(j); + final Set haplotypesWithComp = haplotypeMap.get(comp); + + // if the variants are in phase, record that fact + if ( haplotypesWithCall.size() == haplotypesWithComp.size() && haplotypesWithCall.containsAll(haplotypesWithComp) ) { + // if it's part of an existing group, use that group's unique ID + if ( phaseSetMapping.containsKey(call) ) { + phaseSetMapping.put(comp, phaseSetMapping.get(call)); + // otherwise, create a new group + } else { + phaseSetMapping.put(call, uniqueCounter); + phaseSetMapping.put(comp, uniqueCounter); + uniqueCounter++; + } + } + } + } + + return uniqueCounter; + } + + /** + * Assemble the phase groups together and update the original calls accordingly + * + * @param originalCalls the original unphased calls + * @param phaseSetMapping mapping from call (variant context) to phase group ID + * @param indexTo last index (exclusive) of phase group IDs + * @return a non-null list which represents the possibly phased version of the calls + */ + protected static List constructPhaseGroups(final List originalCalls, + final Map phaseSetMapping, + final int indexTo) { + final List phasedCalls = new ArrayList<>(originalCalls); + + // if we managed to find any phased groups, update the VariantContexts + for ( int count = 0; count < indexTo; count++ ) { + // get all of the (indexes of the) calls that belong in this group (keeping them in the original order) + final List indexes = new ArrayList<>(); + for ( int index = 0; index < originalCalls.size(); index++ ) { + final VariantContext call = originalCalls.get(index); + if ( phaseSetMapping.containsKey(call) && phaseSetMapping.get(call) == count ) + indexes.add(index); + } + if ( indexes.size() < 2 ) + throw new IllegalStateException("Somehow we have a group of phased variants that has fewer than 2 members"); + + // create a unique ID based on the leftmost one + final String uniqueID = createUniqueID(originalCalls.get(indexes.get(0))); + + // update the VCs + for ( final int index : indexes ) { + final VariantContext phasedCall = phaseVC(originalCalls.get(index), uniqueID); + phasedCalls.set(index, phasedCall); + } + } + + return phasedCalls; + } + + /** + * Is this variant bi-allelic? This implementation is very much specific to this class so shouldn't be pulled out into a generalized place. + * + * @param vc the variant context + * @return true if this variant context is bi-allelic, ignoring the NON-REF symbolic allele, false otherwise + */ + private static boolean isBiallelic(final VariantContext vc) { + return vc.isBiallelic() || (vc.getNAlleles() == 3 && vc.getAlternateAlleles().contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)); + } + + /** + * Create a unique identifier given the variant context + * + * @param vc the variant context + * @return non-null String + */ + private static String createUniqueID(final VariantContext vc) { + return String.format("%d_%s_%s", vc.getStart(), vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString()); + // return base + "_0," + base + "_1"; + } + + /** + * Add physical phase information to the provided variant context + * + * @param vc the variant context + * @param ID the ID to use + * @return phased non-null variant context + */ + private static VariantContext phaseVC(final VariantContext vc, final String ID) { + final List phasedGenotypes = new ArrayList<>(); + for ( final Genotype g : vc.getGenotypes() ) + phasedGenotypes.add(new GenotypeBuilder(g).attribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY, ID).make()); + return new VariantContextBuilder(vc).genotypes(phasedGenotypes).make(); } // Builds the read-likelihoods collection to use for annotation considering user arguments and the collection diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java index 2fd7a64f7..19f07eccd 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java @@ -57,6 +57,7 @@ import htsjdk.variant.variantcontext.*; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.haplotype.EventMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; @@ -365,4 +366,201 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { } return true; } -} \ No newline at end of file + + + @DataProvider(name = "CreateHaplotypeMappingProvider") + public Object[][] makeCreateHaplotypeMappingData() { + List tests = new ArrayList(); + + final Set haplotypes = new HashSet<>(); + final Allele ref = Allele.create("A", true); + final Allele altC = Allele.create("C", false); + final Allele altT = Allele.create("T", false); + + final Haplotype AtoC1 = new Haplotype("AACAA".getBytes()); + final VariantContext vc1 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).make(); + AtoC1.setEventMap(new EventMap(Arrays.asList(vc1))); + AtoC1.getEventMap().put(3, vc1); + haplotypes.add(AtoC1); + + final Haplotype AtoC2 = new Haplotype("AAACA".getBytes()); + final VariantContext vc2 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altT)).make(); + AtoC2.setEventMap(new EventMap(Arrays.asList(vc2))); + AtoC2.getEventMap().put(4, vc2); + haplotypes.add(AtoC2); + + tests.add(new Object[]{vc1, haplotypes, AtoC1}); + tests.add(new Object[]{vc2, haplotypes, AtoC2}); + tests.add(new Object[]{new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altT)).make(), haplotypes, null}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider="CreateHaplotypeMappingProvider") + public void testCreateHaplotypeMapping(final VariantContext vc, final Set haplotypes, final Haplotype expected) { + final Map> mapping = HaplotypeCallerGenotypingEngine.constructHaplotypeMapping(Arrays.asList(vc), haplotypes); + final Set actual = mapping.get(vc); + if ( expected == null ) + Assert.assertTrue(actual.isEmpty(), actual.toString()); + else { + Assert.assertEquals(actual.size(), 1); + Assert.assertEquals(actual.iterator().next(), expected); + } + } + + @DataProvider(name = "ConstructPhaseSetMappingProvider") + public Object[][] makeConstructPhaseSetMappingData() { + List tests = new ArrayList(); + + final Allele ref = Allele.create("A", true); + final Allele altC = Allele.create("C", false); + final Allele altT = Allele.create("T", false); + + final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).make(); + final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altT)).make(); + final VariantContext vc4 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altC)).make(); + final List calls = Arrays.asList(vc2, vc3, vc4); + + final Haplotype pos2 = new Haplotype("ACAAA".getBytes()); + pos2.setEventMap(new EventMap(Arrays.asList(vc2))); + pos2.getEventMap().put(2, vc2); + final Haplotype pos3 = new Haplotype("AACAA".getBytes()); + pos3.setEventMap(new EventMap(Arrays.asList(vc3))); + pos3.getEventMap().put(3, vc3); + final Haplotype pos4 = new Haplotype("AAACA".getBytes()); + pos4.setEventMap(new EventMap(Arrays.asList(vc4))); + pos4.getEventMap().put(4, vc4); + final Haplotype pos24 = new Haplotype("ACACA".getBytes()); + pos24.setEventMap(new EventMap(Arrays.asList(vc2, vc4))); + pos24.getEventMap().put(2, vc2); + pos24.getEventMap().put(4, vc4); + final Haplotype pos234 = new Haplotype("ACCCA".getBytes()); + pos234.setEventMap(new EventMap(Arrays.asList(vc2, vc3, vc4))); + pos234.getEventMap().put(2, vc2); + pos234.getEventMap().put(3, vc3); + pos234.getEventMap().put(4, vc4); + + // test no phased variants + final Set haplotypes2NoPhase = new HashSet<>(); + haplotypes2NoPhase.add(pos2); + haplotypes2NoPhase.add(pos24); + final Set haplotypes3NoPhase = new HashSet<>(); + haplotypes3NoPhase.add(pos3); + final Set haplotypes4NoPhase = new HashSet<>(); + haplotypes4NoPhase.add(pos4); + haplotypes4NoPhase.add(pos24); + final Map> haplotypeMapNoPhase = new HashMap<>(); + haplotypeMapNoPhase.put(vc2, haplotypes2NoPhase); + haplotypeMapNoPhase.put(vc3, haplotypes3NoPhase); + haplotypeMapNoPhase.put(vc4, haplotypes4NoPhase); + tests.add(new Object[]{calls, haplotypeMapNoPhase, 0, 0}); + + // test 2 phased variants + final Set haplotypes2SomePhase = new HashSet<>(); + haplotypes2SomePhase.add(pos24); + final Set haplotypes4SomePhase = new HashSet<>(); + haplotypes4SomePhase.add(pos24); + final Map> haplotypeMapSomePhase = new HashMap<>(); + haplotypeMapSomePhase.put(vc2, haplotypes2SomePhase); + haplotypeMapSomePhase.put(vc3, haplotypes3NoPhase); + haplotypeMapSomePhase.put(vc4, haplotypes4SomePhase); + tests.add(new Object[]{calls, haplotypeMapSomePhase, 2, 1}); + + // test all phased variants + final Set haplotypes2AllPhase = new HashSet<>(); + haplotypes2AllPhase.add(pos234); + final Set haplotypes3AllPhase = new HashSet<>(); + haplotypes3AllPhase.add(pos234); + final Set haplotypes4AllPhase = new HashSet<>(); + haplotypes4AllPhase.add(pos234); + final Map> haplotypeMapAllPhase = new HashMap<>(); + haplotypeMapAllPhase.put(vc2, haplotypes2AllPhase); + haplotypeMapAllPhase.put(vc3, haplotypes3AllPhase); + haplotypeMapAllPhase.put(vc4, haplotypes4AllPhase); + tests.add(new Object[]{calls, haplotypeMapAllPhase, 3, 1}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider="ConstructPhaseSetMappingProvider") + public void testConstructPhaseSetMapping(final List calls, + final Map> haplotypeMap, + final int expectedMapSize, + final int expectedNumGroups) { + final Map actualPhaseSetMapping = new HashMap<>(); + final int actualNumGroups = HaplotypeCallerGenotypingEngine.constructPhaseSetMapping(calls, haplotypeMap, actualPhaseSetMapping); + Assert.assertEquals(actualNumGroups, expectedNumGroups); + Assert.assertEquals(actualPhaseSetMapping.size(), expectedMapSize); + } + + @DataProvider(name = "ConstructPhaseGroupsProvider") + public Object[][] makeConstructPhaseGroupsData() { + List tests = new ArrayList(); + + final Allele ref = Allele.create("A", true); + final Allele altC = Allele.create("C", false); + + final Genotype g1 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make(); + final VariantContext vc1 = new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altC)).genotypes(g1).make(); + final Genotype g2 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make(); + final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).genotypes(g2).make(); + final Genotype g3 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make(); + final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).genotypes(g3).make(); + final List calls = Arrays.asList(vc1, vc2, vc3); + + // test no phased variants, empty map + final Map nonePhased1 = new HashMap<>(); + tests.add(new Object[]{calls, nonePhased1, 0, 0, 0}); + + // test no phased variants, full map, exception expected + final Map nonePhased2 = new HashMap<>(); + nonePhased2.put(vc1, 0); + nonePhased2.put(vc2, 1); + nonePhased2.put(vc3, 2); + tests.add(new Object[]{calls, nonePhased2, 3, -1, -1}); + + // test 2 phased variants + final Map twoPhased = new HashMap<>(); + twoPhased.put(vc1, 0); + twoPhased.put(vc2, 0); + tests.add(new Object[]{calls, twoPhased, 1, 1, 2}); + + // test all phased variants + final Map allPhased = new HashMap<>(); + allPhased.put(vc1, 0); + allPhased.put(vc2, 0); + allPhased.put(vc3, 0); + tests.add(new Object[]{calls, allPhased, 1, 1, 3}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider="ConstructPhaseGroupsProvider") + public void testConstructPhaseGroups(final List calls, + final Map phaseMap, + final int endIndex, + final int expectedNumGroups, + final int expectedGroupSize) { + final List actualPhasedCalls; + try { + actualPhasedCalls = HaplotypeCallerGenotypingEngine.constructPhaseGroups(calls, phaseMap, endIndex); + } catch (IllegalStateException e) { + Assert.assertEquals(-1, expectedNumGroups); + return; + } + + final Set uniqueGroups = new HashSet<>(); + int counter = 0; + for ( final VariantContext call : actualPhasedCalls ) { + for ( final Genotype g : call.getGenotypes() ) { + if ( g.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY) ) { + uniqueGroups.add(g.getExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY).toString()); + counter++; + } + } + } + + Assert.assertEquals(uniqueGroups.size(), expectedNumGroups); + Assert.assertEquals(counter, expectedGroupSize); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java index a540054bc..c460ff555 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java @@ -74,7 +74,7 @@ public class EventMap extends TreeMap { * For testing. Let's you set up a explicit configuration without having to process a haplotype and reference * @param stateForTesting */ - protected EventMap(final Collection stateForTesting) { + public EventMap(final Collection stateForTesting) { haplotype = null; ref = null; refLoc = null; @@ -176,99 +176,8 @@ public class EventMap extends TreeMap { } } - // handle the case where the event set for the haplotype is very complex - // TODO -- theoretically this should be part of the MergeVariantsAcrossHaplotypes class, but it's just so much more complicated to do so - if ( variationIsTooComplex(proposedEvents) ) { - addComplexVC(cigar, alignment, haplotype.getAlignmentStartHapwrtRef()); - } else { - for ( final VariantContext proposedEvent : proposedEvents ) - addVC(proposedEvent, true); - } - } - - /** - * Determine whether the provided set of variants is too complex for breaking up into individual parts - * - * @param events the individual events - * @return true if the cigar is too complex, false otherwise - */ - private boolean variationIsTooComplex(final List events) { - // TODO -- we've decided to disable this for now and try "physical phasing" - return false; - - //int indelCount = 0; - //for ( final VariantContext event : events ) { - // if ( event.isIndel() ) - // indelCount++; - //} - // - // don't allow too many indels - //return indelCount > MAX_INDELS_PER_HAPLOTYPE; - } - - /** - * Add a complex variant context to the events given the haplotype and its cigar - * - * @param cigar the cigar to convert - * @param haplotype the bases of the alternate haplotype - * @param refPos the position on the reference for this alignment - */ - private void addComplexVC(final Cigar cigar, final byte[] haplotype, final int refPos) { - // ignore leading and trailing bases that match between this haplotype and the reference - final int matchingPrefix = numPrefixMatch(ref, haplotype, refPos, 0); - final int matchingSuffix = numSuffixMatch(ref, haplotype, refPos + cigar.getReferenceLength() - 1, haplotype.length - 1); - - // edge case: too many matches - final int totalMatch = matchingPrefix + matchingSuffix; - if ( totalMatch >= haplotype.length || totalMatch >= ref.length ) - return; - - final byte[] refBases = Arrays.copyOfRange(ref, refPos + matchingPrefix, refPos + cigar.getReferenceLength() - matchingSuffix); - final byte[] altBases = Arrays.copyOfRange(haplotype, matchingPrefix, haplotype.length - matchingSuffix); - - final List alleles = new ArrayList<>(); - alleles.add( Allele.create( refBases, true ) ); - alleles.add( Allele.create( altBases, false ) ); - final int start = refLoc.getStart() + refPos + matchingPrefix; - addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), start, start + refBases.length - 1, alleles).make(), true); - } - - /** - * calculates the extent of the prefix match between 2 sequences - * - * @param seq1 the first sequence - * @param seq2 the second sequence - * @param startPos1 the index on seq1 to start from - * @param startPos2 the index on seq2 to start from - * @return non-negative int representing the matching prefix - */ - private int numPrefixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) { - int matchingBases = 0; - for ( int pos1 = startPos1, pos2 = startPos2; pos1 < seq1.length && pos2 < seq2.length; pos1++, pos2++ ) { - if ( seq1[pos1] != seq2[pos2] ) - break; - matchingBases++; - } - return matchingBases; - } - - /** - * calculates the extent of the suffix match between 2 sequences - * - * @param seq1 the first sequence - * @param seq2 the second sequence - * @param startPos1 the index on seq1 to start from - * @param startPos2 the index on seq2 to start from - * @return non-negative int representing the matching suffix - */ - private int numSuffixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) { - int matchingBases = 0; - for ( int pos1 = startPos1, pos2 = startPos2; pos1 >=0 && pos2 >= 0; pos1--, pos2-- ) { - if ( seq1[pos1] != seq2[pos2] ) - break; - matchingBases++; - } - return matchingBases; + for ( final VariantContext proposedEvent : proposedEvents ) + addVC(proposedEvent, true); } /** @@ -345,7 +254,7 @@ public class EventMap extends TreeMap { // TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it @Requires("getNumberOfEvents() > 0") - protected void replaceClumpedEventsWithBlockSubstititions() { + protected void replaceClumpedEventsWithBlockSubstitutions() { if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) { int lastStart = -1; for ( boolean foundOne = true; foundOne; ) { diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java index 3870182fe..6baf6ef4d 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java @@ -130,7 +130,7 @@ public class EventMapUnitTest extends BaseTest { final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME); - ee.replaceClumpedEventsWithBlockSubstititions(); + ee.replaceClumpedEventsWithBlockSubstitutions(); Assert.assertEquals(ee.getNumberOfEvents(), 1); final VariantContext actual = ee.getVariantContexts().iterator().next(); Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual); @@ -159,7 +159,7 @@ public class EventMapUnitTest extends BaseTest { final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME); - ee.replaceClumpedEventsWithBlockSubstititions(); + ee.replaceClumpedEventsWithBlockSubstitutions(); Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size()); final List actuals = new ArrayList(ee.getVariantContexts()); for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) {