Initial implementation of functionality to add physical phasing information to the output of the HaplotypeCaller.

If any pair of variants occurs on all used haplotypes together, then we propagate that information into the gVCF.
Can be enabled with the --tryPhysicalPhasing argument.
This commit is contained in:
Eric Banks 2014-07-14 14:57:56 -04:00
parent abcaba4bc3
commit 4512940e87
5 changed files with 393 additions and 101 deletions

View File

@ -495,6 +495,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false)
protected boolean mergeVariantsViaLD = false;
@Advanced
@Argument(fullName="tryPhysicalPhasing", shortName="tryPhysicalPhasing", doc="If specified, we will add physical (read-based) phasing information", required = false)
protected boolean tryPhysicalPhasing = false;
public static final String HAPLOTYPE_CALLER_PHASING_KEY = "HCP";
// -----------------------------------------------------------------------------------------------
// arguments for debugging / developing the haplotype caller
// -----------------------------------------------------------------------------------------------
@ -671,7 +677,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && consensusMode )
throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. Please choose one or the other.");
genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(),SCAC);
genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(), SCAC, tryPhysicalPhasing);
// initialize the output VCF header
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
@ -692,6 +698,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
VCFConstants.DEPTH_KEY,
VCFConstants.GENOTYPE_PL_KEY);
if ( tryPhysicalPhasing )
headerInfo.add(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Physical phasing information, each unique ID within a given sample (but not across samples) connects alternate alleles as occurring on the same haplotype"));
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
// where the filters are used. For example, in emitting all sites the lowQual field is used
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality"));

View File

@ -78,13 +78,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
private final boolean tryPhysicalPhasing;
/**
* {@inheritDoc}
* @param toolkit {@inheritDoc}
* @param configuration {@inheritDoc}
*/
public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration) {
public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration, final boolean tryPhysicalPhasing) {
super(toolkit,configuration);
this.tryPhysicalPhasing = tryPhysicalPhasing;
}
/**
@ -95,6 +98,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
*/
// Constructor variant that takes an explicit sample set; it does not expose the phasing flag,
// so physical phasing is disabled for engines built this way.
public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration, final Set<String> sampleNames) {
super(toolkit,configuration,sampleNames);
// physical phasing is opt-in via the other constructor only
tryPhysicalPhasing = false;
}
@ -279,7 +283,179 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
}
}
return new CalledHaplotypes(returnCalls, calledHaplotypes);
final List<VariantContext> phasedCalls = tryPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
return new CalledHaplotypes(phasedCalls, calledHaplotypes);
}
/**
 * Tries to phase the individual alleles based on pairwise comparisons to the other alleles based on all called haplotypes
 *
 * @param calls the list of called alleles
 * @param calledHaplotypes the set of haplotypes used for calling
 * @return a non-null list which represents the possibly phased version of the calls
 */
protected List<VariantContext> phaseCalls(final List<VariantContext> calls, final Set<Haplotype> calledHaplotypes) {
    // step 1: for every call, find the haplotypes that carry its alternate allele
    final Map<VariantContext, Set<Haplotype>> callToHaplotypes = constructHaplotypeMapping(calls, calledHaplotypes);

    // step 2: pair up calls that always co-occur on the same haplotypes and assign phase-group IDs
    final Map<VariantContext, Integer> callToGroupId = new HashMap<>();
    final int numGroups = constructPhaseSetMapping(calls, callToHaplotypes, callToGroupId);

    // step 3: turn the pairwise relationships into whole groups and tag the genotypes accordingly
    return constructPhaseGroups(calls, callToGroupId, numGroups);
}
/**
 * Construct the mapping from alternate allele to the set of haplotypes that contain that allele
 *
 * @param originalCalls the original unphased calls
 * @param calledHaplotypes the set of haplotypes used for calling
 * @return non-null Map from each call to the (possibly empty) set of haplotypes carrying its alternate allele
 */
protected static Map<VariantContext, Set<Haplotype>> constructHaplotypeMapping(final List<VariantContext> originalCalls,
                                                                               final Set<Haplotype> calledHaplotypes) {
    final Map<VariantContext, Set<Haplotype>> haplotypeMap = new HashMap<>(originalCalls.size());
    for ( final VariantContext call : originalCalls ) {
        // don't try to phase if there is not exactly 1 alternate allele; such calls map to the empty set
        if ( ! isBiallelic(call) ) {
            haplotypeMap.put(call, Collections.<Haplotype>emptySet());
            continue;
        }

        // keep track of the haplotypes that contain this particular alternate allele
        final Set<Haplotype> hapsWithAllele = new HashSet<>();
        final Allele alt = call.getAlternateAllele(0);
        for ( final Haplotype h : calledHaplotypes ) {
            for ( final VariantContext event : h.getEventMap().getVariantContexts() ) {
                if ( event.getStart() == call.getStart() && event.getAlternateAlleles().contains(alt) ) {
                    hapsWithAllele.add(h);
                    // EventMap keys events by start position, so there is at most one event at this
                    // start per haplotype -- no need to keep scanning the remaining events
                    break;
                }
            }
        }

        haplotypeMap.put(call, hapsWithAllele);
    }

    return haplotypeMap;
}
/**
 * Construct the mapping from call (variant context) to phase set ID.
 *
 * Two calls are considered in phase when they occur on exactly the same set of called haplotypes
 * (checked as same size + containsAll, i.e. set equality).  Since set equality is an equivalence
 * relation, the pairwise scan below assigns consistent group IDs.
 *
 * @param originalCalls the original unphased calls
 * @param haplotypeMap mapping from alternate allele to the set of haplotypes that contain that allele
 * @param phaseSetMapping the map to populate in this method; side effect: filled with call -> phase set ID
 * @return the next incremental unique index (i.e. the number of phase groups created)
 */
protected static int constructPhaseSetMapping(final List<VariantContext> originalCalls,
final Map<VariantContext, Set<Haplotype>> haplotypeMap,
final Map<VariantContext, Integer> phaseSetMapping) {
final int numCalls = originalCalls.size();
int uniqueCounter = 0;
// use the haplotype mapping to connect variants that are always present on the same haplotypes
for ( int i = 0; i < numCalls - 1; i++ ) {
final VariantContext call = originalCalls.get(i);
final Set<Haplotype> haplotypesWithCall = haplotypeMap.get(call);
// non-bi-allelic calls were mapped to the empty set and can never be phased
if ( haplotypesWithCall.isEmpty() )
continue;
for ( int j = i+1; j < numCalls; j++ ) {
final VariantContext comp = originalCalls.get(j);
final Set<Haplotype> haplotypesWithComp = haplotypeMap.get(comp);
// if the variants are in phase, record that fact
// (size check + containsAll avoids allocating a second set for the equality test)
if ( haplotypesWithCall.size() == haplotypesWithComp.size() && haplotypesWithCall.containsAll(haplotypesWithComp) ) {
// if it's part of an existing group, use that group's unique ID
if ( phaseSetMapping.containsKey(call) ) {
phaseSetMapping.put(comp, phaseSetMapping.get(call));
// otherwise, create a new group
} else {
phaseSetMapping.put(call, uniqueCounter);
phaseSetMapping.put(comp, uniqueCounter);
uniqueCounter++;
}
}
}
}
return uniqueCounter;
}
/**
 * Assemble the phase groups together and update the original calls accordingly
 *
 * @param originalCalls the original unphased calls
 * @param phaseSetMapping mapping from call (variant context) to phase group ID
 * @param indexTo last index (exclusive) of phase group IDs
 * @return a non-null list which represents the possibly phased version of the calls
 * @throws IllegalStateException if any group in [0, indexTo) has fewer than 2 members
 */
protected static List<VariantContext> constructPhaseGroups(final List<VariantContext> originalCalls,
                                                           final Map<VariantContext, Integer> phaseSetMapping,
                                                           final int indexTo) {
    final List<VariantContext> phasedCalls = new ArrayList<>(originalCalls);

    // gather the member indexes of each group in a single pass over the calls (preserving the
    // original order within each group) instead of re-scanning the whole list once per group
    final Map<Integer, List<Integer>> groupToIndexes = new HashMap<>();
    for ( int index = 0; index < originalCalls.size(); index++ ) {
        final Integer group = phaseSetMapping.get(originalCalls.get(index));
        if ( group != null ) {
            List<Integer> members = groupToIndexes.get(group);
            if ( members == null ) {
                members = new ArrayList<>();
                groupToIndexes.put(group, members);
            }
            members.add(index);
        }
    }

    // if we managed to find any phased groups, update the VariantContexts
    for ( int count = 0; count < indexTo; count++ ) {
        final List<Integer> indexes = groupToIndexes.get(count);
        // a phase group is only ever created from a pair, so < 2 members indicates a logic error upstream
        if ( indexes == null || indexes.size() < 2 )
            throw new IllegalStateException("Somehow we have a group of phased variants that has fewer than 2 members");

        // create a unique ID based on the leftmost member of the group
        final String uniqueID = createUniqueID(originalCalls.get(indexes.get(0)));

        // replace each member with its phased version
        for ( final int index : indexes )
            phasedCalls.set(index, phaseVC(originalCalls.get(index), uniqueID));
    }

    return phasedCalls;
}
/**
 * Is this variant bi-allelic? This implementation is very much specific to this class so shouldn't be pulled out into a generalized place.
 *
 * @param vc the variant context
 * @return true if this variant context is bi-allelic, ignoring the NON-REF symbolic allele, false otherwise
 */
private static boolean isBiallelic(final VariantContext vc) {
    if ( vc.isBiallelic() )
        return true;
    // ref + one real alt + the symbolic NON-REF allele also counts as bi-allelic for phasing purposes
    return vc.getNAlleles() == 3 && vc.getAlternateAlleles().contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
}
/**
 * Create a unique identifier given the variant context
 *
 * The ID is built from the start position plus the ref and alt allele strings, so calls at
 * different sites (or with different alleles) always receive different IDs.
 *
 * @param vc the variant context
 * @return non-null String
 */
private static String createUniqueID(final VariantContext vc) {
    return String.format("%d_%s_%s", vc.getStart(), vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString());
}
/**
 * Add physical phase information to the provided variant context
 *
 * @param vc the variant context
 * @param ID the ID to use
 * @return phased non-null variant context
 */
private static VariantContext phaseVC(final VariantContext vc, final String ID) {
    // rebuild every genotype with the phasing attribute attached
    final List<Genotype> taggedGenotypes = new ArrayList<>(vc.getGenotypes().size());
    for ( final Genotype genotype : vc.getGenotypes() ) {
        final Genotype tagged = new GenotypeBuilder(genotype).attribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY, ID).make();
        taggedGenotypes.add(tagged);
    }
    return new VariantContextBuilder(vc).genotypes(taggedGenotypes).make();
}
/**

View File

@ -57,6 +57,7 @@ import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.haplotype.EventMap;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
@ -365,4 +366,201 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
}
return true;
}
}
// Provides (call, haplotype set, expected haplotype) triples for testCreateHaplotypeMapping;
// a null expected haplotype means the call should map to no haplotype at all.
@DataProvider(name = "CreateHaplotypeMappingProvider")
public Object[][] makeCreateHaplotypeMappingData() {
List<Object[]> tests = new ArrayList<Object[]>();
final Set<Haplotype> haplotypes = new HashSet<>();
final Allele ref = Allele.create("A", true);
final Allele altC = Allele.create("C", false);
final Allele altT = Allele.create("T", false);
// haplotype carrying a single A->C event at position 3
final Haplotype AtoC1 = new Haplotype("AACAA".getBytes());
final VariantContext vc1 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).make();
AtoC1.setEventMap(new EventMap(Arrays.asList(vc1)));
AtoC1.getEventMap().put(3, vc1);
haplotypes.add(AtoC1);
// haplotype carrying a single event at position 4
// NOTE(review): vc2 uses the T alternate allele while the haplotype bases show a C at
// position 4 (and the variable name says "AtoC") -- only the event map matters to the
// code under test, but the bases/name look inconsistent with the alleles; confirm intent
final Haplotype AtoC2 = new Haplotype("AAACA".getBytes());
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altT)).make();
AtoC2.setEventMap(new EventMap(Arrays.asList(vc2)));
AtoC2.getEventMap().put(4, vc2);
haplotypes.add(AtoC2);
// each call should map to exactly the haplotype that carries it
tests.add(new Object[]{vc1, haplotypes, AtoC1});
tests.add(new Object[]{vc2, haplotypes, AtoC2});
// a call at a position where no haplotype has an event should map to nothing
tests.add(new Object[]{new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altT)).make(), haplotypes, null});
return tests.toArray(new Object[][]{});
}
/**
 * Checks that constructHaplotypeMapping maps a single call to exactly the expected
 * haplotype, or to the empty set when no haplotype carries it.
 */
@Test(dataProvider="CreateHaplotypeMappingProvider")
public void testCreateHaplotypeMapping(final VariantContext vc, final Set<Haplotype> haplotypes, final Haplotype expected) {
    final Map<VariantContext, Set<Haplotype>> result = HaplotypeCallerGenotypingEngine.constructHaplotypeMapping(Arrays.asList(vc), haplotypes);
    final Set<Haplotype> mappedHaplotypes = result.get(vc);

    if ( expected == null ) {
        // the call should not be attributed to any haplotype
        Assert.assertTrue(mappedHaplotypes.isEmpty(), mappedHaplotypes.toString());
        return;
    }

    // exactly one haplotype, and it must be the expected one
    Assert.assertEquals(mappedHaplotypes.size(), 1);
    Assert.assertEquals(mappedHaplotypes.iterator().next(), expected);
}
// Provides (calls, haplotype map, expected mapping size, expected group count) tuples for
// testConstructPhaseSetMapping.  Three scenarios: no calls share their haplotype sets,
// two of three do, and all three do.
@DataProvider(name = "ConstructPhaseSetMappingProvider")
public Object[][] makeConstructPhaseSetMappingData() {
List<Object[]> tests = new ArrayList<Object[]>();
final Allele ref = Allele.create("A", true);
final Allele altC = Allele.create("C", false);
final Allele altT = Allele.create("T", false);
// three bi-allelic SNP calls at positions 2, 3 and 4
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altT)).make();
final VariantContext vc4 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altC)).make();
final List<VariantContext> calls = Arrays.asList(vc2, vc3, vc4);
// haplotypes carrying various subsets of the three events
final Haplotype pos2 = new Haplotype("ACAAA".getBytes());
pos2.setEventMap(new EventMap(Arrays.asList(vc2)));
pos2.getEventMap().put(2, vc2);
final Haplotype pos3 = new Haplotype("AACAA".getBytes());
pos3.setEventMap(new EventMap(Arrays.asList(vc3)));
pos3.getEventMap().put(3, vc3);
final Haplotype pos4 = new Haplotype("AAACA".getBytes());
pos4.setEventMap(new EventMap(Arrays.asList(vc4)));
pos4.getEventMap().put(4, vc4);
final Haplotype pos24 = new Haplotype("ACACA".getBytes());
pos24.setEventMap(new EventMap(Arrays.asList(vc2, vc4)));
pos24.getEventMap().put(2, vc2);
pos24.getEventMap().put(4, vc4);
final Haplotype pos234 = new Haplotype("ACCCA".getBytes());
pos234.setEventMap(new EventMap(Arrays.asList(vc2, vc3, vc4)));
pos234.getEventMap().put(2, vc2);
pos234.getEventMap().put(3, vc3);
pos234.getEventMap().put(4, vc4);
// test no phased variants: every call has a distinct haplotype set -> no groups, empty mapping
final Set<Haplotype> haplotypes2NoPhase = new HashSet<>();
haplotypes2NoPhase.add(pos2);
haplotypes2NoPhase.add(pos24);
final Set<Haplotype> haplotypes3NoPhase = new HashSet<>();
haplotypes3NoPhase.add(pos3);
final Set<Haplotype> haplotypes4NoPhase = new HashSet<>();
haplotypes4NoPhase.add(pos4);
haplotypes4NoPhase.add(pos24);
final Map<VariantContext, Set<Haplotype>> haplotypeMapNoPhase = new HashMap<>();
haplotypeMapNoPhase.put(vc2, haplotypes2NoPhase);
haplotypeMapNoPhase.put(vc3, haplotypes3NoPhase);
haplotypeMapNoPhase.put(vc4, haplotypes4NoPhase);
tests.add(new Object[]{calls, haplotypeMapNoPhase, 0, 0});
// test 2 phased variants: vc2 and vc4 share the identical set {pos24} -> 1 group of 2 calls
final Set<Haplotype> haplotypes2SomePhase = new HashSet<>();
haplotypes2SomePhase.add(pos24);
final Set<Haplotype> haplotypes4SomePhase = new HashSet<>();
haplotypes4SomePhase.add(pos24);
final Map<VariantContext, Set<Haplotype>> haplotypeMapSomePhase = new HashMap<>();
haplotypeMapSomePhase.put(vc2, haplotypes2SomePhase);
haplotypeMapSomePhase.put(vc3, haplotypes3NoPhase);
haplotypeMapSomePhase.put(vc4, haplotypes4SomePhase);
tests.add(new Object[]{calls, haplotypeMapSomePhase, 2, 1});
// test all phased variants: all three calls share {pos234} -> 1 group of 3 calls
final Set<Haplotype> haplotypes2AllPhase = new HashSet<>();
haplotypes2AllPhase.add(pos234);
final Set<Haplotype> haplotypes3AllPhase = new HashSet<>();
haplotypes3AllPhase.add(pos234);
final Set<Haplotype> haplotypes4AllPhase = new HashSet<>();
haplotypes4AllPhase.add(pos234);
final Map<VariantContext, Set<Haplotype>> haplotypeMapAllPhase = new HashMap<>();
haplotypeMapAllPhase.put(vc2, haplotypes2AllPhase);
haplotypeMapAllPhase.put(vc3, haplotypes3AllPhase);
haplotypeMapAllPhase.put(vc4, haplotypes4AllPhase);
tests.add(new Object[]{calls, haplotypeMapAllPhase, 3, 1});
return tests.toArray(new Object[][]{});
}
/**
 * Verifies that constructPhaseSetMapping produces the expected number of phase groups
 * and maps the expected number of calls into those groups.
 */
@Test(dataProvider="ConstructPhaseSetMappingProvider")
public void testConstructPhaseSetMapping(final List<VariantContext> calls,
                                         final Map<VariantContext, Set<Haplotype>> haplotypeMap,
                                         final int expectedMapSize,
                                         final int expectedNumGroups) {
    final Map<VariantContext, Integer> mapping = new HashMap<>();
    final int numGroups = HaplotypeCallerGenotypingEngine.constructPhaseSetMapping(calls, haplotypeMap, mapping);

    Assert.assertEquals(mapping.size(), expectedMapSize);
    Assert.assertEquals(numGroups, expectedNumGroups);
}
// Provides (calls, phase-set map, end index, expected group count, expected phased-genotype
// count) tuples for testConstructPhaseGroups.  An expected group count of -1 signals that an
// IllegalStateException is expected (a group with fewer than 2 members).
@DataProvider(name = "ConstructPhaseGroupsProvider")
public Object[][] makeConstructPhaseGroupsData() {
List<Object[]> tests = new ArrayList<Object[]>();
final Allele ref = Allele.create("A", true);
final Allele altC = Allele.create("C", false);
// three calls, each with one genotype so the phasing attribute can be checked per-genotype
final Genotype g1 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc1 = new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altC)).genotypes(g1).make();
final Genotype g2 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).genotypes(g2).make();
final Genotype g3 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).genotypes(g3).make();
final List<VariantContext> calls = Arrays.asList(vc1, vc2, vc3);
// test no phased variants, empty map
final Map<VariantContext, Integer> nonePhased1 = new HashMap<>();
tests.add(new Object[]{calls, nonePhased1, 0, 0, 0});
// test no phased variants, full map, exception expected (every group has only 1 member)
final Map<VariantContext, Integer> nonePhased2 = new HashMap<>();
nonePhased2.put(vc1, 0);
nonePhased2.put(vc2, 1);
nonePhased2.put(vc3, 2);
tests.add(new Object[]{calls, nonePhased2, 3, -1, -1});
// test 2 phased variants
final Map<VariantContext, Integer> twoPhased = new HashMap<>();
twoPhased.put(vc1, 0);
twoPhased.put(vc2, 0);
tests.add(new Object[]{calls, twoPhased, 1, 1, 2});
// test all phased variants
final Map<VariantContext, Integer> allPhased = new HashMap<>();
allPhased.put(vc1, 0);
allPhased.put(vc2, 0);
allPhased.put(vc3, 0);
tests.add(new Object[]{calls, allPhased, 1, 1, 3});
return tests.toArray(new Object[][]{});
}
/**
 * Verifies constructPhaseGroups: the resulting calls must carry the expected number of
 * distinct phase-group IDs and the expected total number of phased genotypes.
 * An expectedNumGroups of -1 signals that an IllegalStateException is expected.
 */
@Test(dataProvider="ConstructPhaseGroupsProvider")
public void testConstructPhaseGroups(final List<VariantContext> calls,
                                     final Map<VariantContext, Integer> phaseMap,
                                     final int endIndex,
                                     final int expectedNumGroups,
                                     final int expectedGroupSize) {
    final List<VariantContext> actualPhasedCalls;
    try {
        actualPhasedCalls = HaplotypeCallerGenotypingEngine.constructPhaseGroups(calls, phaseMap, endIndex);
    } catch (IllegalStateException e) {
        // TestNG's assertEquals takes (actual, expected); the original had the arguments
        // reversed, which produced a misleading failure message on an unexpected exception
        Assert.assertEquals(expectedNumGroups, -1);
        return;
    }

    // count the distinct phase-group IDs and the number of genotypes that carry one
    final Set<String> uniqueGroups = new HashSet<>();
    int phasedGenotypeCount = 0;
    for ( final VariantContext call : actualPhasedCalls ) {
        for ( final Genotype g : call.getGenotypes() ) {
            if ( g.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY) ) {
                uniqueGroups.add(g.getExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY).toString());
                phasedGenotypeCount++;
            }
        }
    }
    Assert.assertEquals(uniqueGroups.size(), expectedNumGroups);
    Assert.assertEquals(phasedGenotypeCount, expectedGroupSize);
}
}

View File

@ -74,7 +74,7 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
* For testing. Let's you set up a explicit configuration without having to process a haplotype and reference
* @param stateForTesting
*/
protected EventMap(final Collection<VariantContext> stateForTesting) {
public EventMap(final Collection<VariantContext> stateForTesting) {
haplotype = null;
ref = null;
refLoc = null;
@ -176,99 +176,8 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
}
}
// handle the case where the event set for the haplotype is very complex
// TODO -- theoretically this should be part of the MergeVariantsAcrossHaplotypes class, but it's just so much more complicated to do so
if ( variationIsTooComplex(proposedEvents) ) {
addComplexVC(cigar, alignment, haplotype.getAlignmentStartHapwrtRef());
} else {
for ( final VariantContext proposedEvent : proposedEvents )
addVC(proposedEvent, true);
}
}
/**
 * Determine whether the provided set of variants is too complex for breaking up into individual parts
 *
 * @param events the individual events
 * @return true if the cigar is too complex, false otherwise
 */
private boolean variationIsTooComplex(final List<VariantContext> events) {
// TODO -- we've decided to disable this for now and try "physical phasing"
// NOTE(review): hard-wired to false, so the complex-VC path guarded by this check is
// currently unreachable; the original indel-counting logic is preserved below, disabled
return false;
//int indelCount = 0;
//for ( final VariantContext event : events ) {
// if ( event.isIndel() )
// indelCount++;
//}
//
// don't allow too many indels
//return indelCount > MAX_INDELS_PER_HAPLOTYPE;
}
/**
 * Add a complex variant context to the events given the haplotype and its cigar
 *
 * @param cigar the cigar to convert
 * @param haplotype the bases of the alternate haplotype
 * @param refPos the position on the reference for this alignment
 */
private void addComplexVC(final Cigar cigar, final byte[] haplotype, final int refPos) {
// ignore leading and trailing bases that match between this haplotype and the reference
final int matchingPrefix = numPrefixMatch(ref, haplotype, refPos, 0);
final int matchingSuffix = numSuffixMatch(ref, haplotype, refPos + cigar.getReferenceLength() - 1, haplotype.length - 1);
// edge case: too many matches
// if prefix + suffix cover the whole haplotype (or reference) there is no variant left to emit
final int totalMatch = matchingPrefix + matchingSuffix;
if ( totalMatch >= haplotype.length || totalMatch >= ref.length )
return;
// the remaining interior bases form a single block-substitution allele pair
final byte[] refBases = Arrays.copyOfRange(ref, refPos + matchingPrefix, refPos + cigar.getReferenceLength() - matchingSuffix);
final byte[] altBases = Arrays.copyOfRange(haplotype, matchingPrefix, haplotype.length - matchingSuffix);
final List<Allele> alleles = new ArrayList<>();
alleles.add( Allele.create( refBases, true ) );
alleles.add( Allele.create( altBases, false ) );
// convert the 0-based offset into the reference contig coordinate of the event
final int start = refLoc.getStart() + refPos + matchingPrefix;
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), start, start + refBases.length - 1, alleles).make(), true);
}
/**
 * calculates the extent of the prefix match between 2 sequences
 *
 * @param seq1 the first sequence
 * @param seq2 the second sequence
 * @param startPos1 the index on seq1 to start from
 * @param startPos2 the index on seq2 to start from
 * @return non-negative int representing the matching prefix
 */
private int numPrefixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) {
int matchingBases = 0;
// walk both sequences forward in lockstep until a mismatch or either sequence ends
for ( int pos1 = startPos1, pos2 = startPos2; pos1 < seq1.length && pos2 < seq2.length; pos1++, pos2++ ) {
if ( seq1[pos1] != seq2[pos2] )
break;
matchingBases++;
}
return matchingBases;
}
/**
* calculates the extent of the suffix match between 2 sequences
*
* @param seq1 the first sequence
* @param seq2 the second sequence
* @param startPos1 the index on seq1 to start from
* @param startPos2 the index on seq2 to start from
* @return non-negative int representing the matching suffix
*/
private int numSuffixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) {
int matchingBases = 0;
for ( int pos1 = startPos1, pos2 = startPos2; pos1 >=0 && pos2 >= 0; pos1--, pos2-- ) {
if ( seq1[pos1] != seq2[pos2] )
break;
matchingBases++;
}
return matchingBases;
for ( final VariantContext proposedEvent : proposedEvents )
addVC(proposedEvent, true);
}
/**
@ -345,7 +254,7 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
// TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it
@Requires("getNumberOfEvents() > 0")
protected void replaceClumpedEventsWithBlockSubstititions() {
protected void replaceClumpedEventsWithBlockSubstitutions() {
if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) {
int lastStart = -1;
for ( boolean foundOne = true; foundOne; ) {

View File

@ -130,7 +130,7 @@ public class EventMapUnitTest extends BaseTest {
final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME);
ee.replaceClumpedEventsWithBlockSubstititions();
ee.replaceClumpedEventsWithBlockSubstitutions();
Assert.assertEquals(ee.getNumberOfEvents(), 1);
final VariantContext actual = ee.getVariantContexts().iterator().next();
Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual);
@ -159,7 +159,7 @@ public class EventMapUnitTest extends BaseTest {
final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME);
ee.replaceClumpedEventsWithBlockSubstititions();
ee.replaceClumpedEventsWithBlockSubstitutions();
Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size());
final List<VariantContext> actuals = new ArrayList<VariantContext>(ee.getVariantContexts());
for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) {