Merge pull request #700 from broadinstitute/eb_phase_HC_variants_PT74816060

Initial implementation of functionality to add physical phasing information
This commit is contained in:
Eric Banks 2014-08-13 12:30:32 -04:00
commit 27193c5048
5 changed files with 393 additions and 101 deletions

View File

@ -496,6 +496,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false)
protected boolean mergeVariantsViaLD = false;
@Advanced
@Argument(fullName="tryPhysicalPhasing", shortName="tryPhysicalPhasing", doc="If specified, we will add physical (read-based) phasing information", required = false)
protected boolean tryPhysicalPhasing = false;
// FORMAT-field key under which per-sample physical phasing IDs are written to the VCF;
// declared in the header only when tryPhysicalPhasing is enabled
public static final String HAPLOTYPE_CALLER_PHASING_KEY = "HCP";
// -----------------------------------------------------------------------------------------------
// arguments for debugging / developing the haplotype caller
// -----------------------------------------------------------------------------------------------
@ -672,7 +678,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && consensusMode )
throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. Please choose one or the other.");
genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(),SCAC);
genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(), SCAC, tryPhysicalPhasing);
// initialize the output VCF header
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
@ -693,6 +699,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
VCFConstants.DEPTH_KEY,
VCFConstants.GENOTYPE_PL_KEY);
if ( tryPhysicalPhasing )
headerInfo.add(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Physical phasing information, each unique ID within a given sample (but not across samples) connects alternate alleles as occurring on the same haplotype"));
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
// where the filters are used. For example, in emitting all sites the lowQual field is used
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality"));

View File

@ -77,13 +77,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
// whether to attempt read-backed (physical) phasing of the final calls; set once at construction
private final boolean tryPhysicalPhasing;
/**
* {@inheritDoc}
* @param toolkit {@inheritDoc}
* @param configuration {@inheritDoc}
* @param tryPhysicalPhasing if true, attempt read-backed (physical) phasing of the final calls
*/
public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration) {
public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration, final boolean tryPhysicalPhasing) {
super(toolkit,configuration);
this.tryPhysicalPhasing = tryPhysicalPhasing;
}
/**
@ -94,6 +97,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
*/
public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration, final Set<String> sampleNames) {
super(toolkit,configuration,sampleNames);
tryPhysicalPhasing = false; // this constructor never enables physical phasing
}
@ -282,7 +286,179 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
}
}
return new CalledHaplotypes(returnCalls, calledHaplotypes);
final List<VariantContext> phasedCalls = tryPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
return new CalledHaplotypes(phasedCalls, calledHaplotypes);
}
/**
 * Attempts read-backed (physical) phasing of the given calls by determining, for each pair
 * of calls, whether they appear on exactly the same set of called haplotypes.
 *
 * @param calls the list of called alleles
 * @param calledHaplotypes the set of haplotypes used for calling
 * @return a non-null list which represents the possibly phased version of the calls
 */
protected List<VariantContext> phaseCalls(final List<VariantContext> calls, final Set<Haplotype> calledHaplotypes) {
    // map each call to the haplotypes on which its alternate allele appears
    final Map<VariantContext, Set<Haplotype>> callsToHaplotypes = constructHaplotypeMapping(calls, calledHaplotypes);

    // assign each phaseable call a phase-set identifier
    final Map<VariantContext, Integer> callsToPhaseSets = new HashMap<>();
    final int nextPhaseSetId = constructPhaseSetMapping(calls, callsToHaplotypes, callsToPhaseSets);

    // group the calls by phase-set ID and annotate their genotypes accordingly
    return constructPhaseGroups(calls, callsToPhaseSets, nextPhaseSetId);
}
/**
 * Construct the mapping from alternate allele to the set of haplotypes that contain that allele
 *
 * @param originalCalls the original unphased calls; only biallelic calls map to a non-empty set
 * @param calledHaplotypes the set of haplotypes used for calling
 * @return non-null Map from each call to the (possibly empty) set of haplotypes carrying its alt allele
 */
protected static Map<VariantContext, Set<Haplotype>> constructHaplotypeMapping(final List<VariantContext> originalCalls,
                                                                               final Set<Haplotype> calledHaplotypes) {
    final Map<VariantContext, Set<Haplotype>> haplotypeMap = new HashMap<>(originalCalls.size());
    for ( final VariantContext call : originalCalls ) {
        // don't try to phase if there is not exactly 1 alternate allele
        if ( ! isBiallelic(call) ) {
            haplotypeMap.put(call, Collections.<Haplotype>emptySet());
            continue;
        }

        // keep track of the haplotypes that contain this particular alternate allele
        final Set<Haplotype> hapsWithAllele = new HashSet<>();
        final Allele alt = call.getAlternateAllele(0);

        for ( final Haplotype h : calledHaplotypes ) {
            for ( final VariantContext event : h.getEventMap().getVariantContexts() ) {
                if ( event.getStart() == call.getStart() && event.getAlternateAlleles().contains(alt) ) {
                    hapsWithAllele.add(h);
                    // this haplotype is already recorded; scanning its remaining events is redundant
                    break;
                }
            }
        }

        haplotypeMap.put(call, hapsWithAllele);
    }

    return haplotypeMap;
}
/**
* Construct the mapping from call (variant context) to phase set ID
*
* Two calls are considered in phase when they appear on exactly the same set of called
* haplotypes; pairs are compared in the original call order.
*
* @param originalCalls the original unphased calls
* @param haplotypeMap mapping from alternate allele to the set of haplotypes that contain that allele
* @param phaseSetMapping the map to populate in this method
* @return the next incremental unique index
*/
protected static int constructPhaseSetMapping(final List<VariantContext> originalCalls,
final Map<VariantContext, Set<Haplotype>> haplotypeMap,
final Map<VariantContext, Integer> phaseSetMapping) {
final int numCalls = originalCalls.size();
int uniqueCounter = 0;
// use the haplotype mapping to connect variants that are always present on the same haplotypes
for ( int i = 0; i < numCalls - 1; i++ ) {
final VariantContext call = originalCalls.get(i);
final Set<Haplotype> haplotypesWithCall = haplotypeMap.get(call);
// calls with an empty haplotype set (e.g. non-biallelic ones) can never be phased
if ( haplotypesWithCall.isEmpty() )
continue;
for ( int j = i+1; j < numCalls; j++ ) {
final VariantContext comp = originalCalls.get(j);
final Set<Haplotype> haplotypesWithComp = haplotypeMap.get(comp);
// if the variants are in phase, record that fact
if ( haplotypesWithCall.size() == haplotypesWithComp.size() && haplotypesWithCall.containsAll(haplotypesWithComp) ) {
// if it's part of an existing group, use that group's unique ID
if ( phaseSetMapping.containsKey(call) ) {
phaseSetMapping.put(comp, phaseSetMapping.get(call));
// otherwise, create a new group
// NOTE(review): if 'comp' was already assigned to a different group by an earlier
// pair, both branches silently re-assign it rather than merging the two groups --
// confirm whether transitive merging of phase groups is intended here
} else {
phaseSetMapping.put(call, uniqueCounter);
phaseSetMapping.put(comp, uniqueCounter);
uniqueCounter++;
}
}
}
}
return uniqueCounter;
}
/**
 * Assemble the phase groups together and update the original calls accordingly
 *
 * @param originalCalls the original unphased calls
 * @param phaseSetMapping mapping from call (variant context) to phase group ID
 * @param indexTo last index (exclusive) of phase group IDs
 * @return a non-null list which represents the possibly phased version of the calls
 * @throws IllegalStateException if any phase group ends up with fewer than 2 members
 */
protected static List<VariantContext> constructPhaseGroups(final List<VariantContext> originalCalls,
                                                           final Map<VariantContext, Integer> phaseSetMapping,
                                                           final int indexTo) {
    final List<VariantContext> phasedCalls = new ArrayList<>(originalCalls);

    // rewrite the members of each phase group in turn
    for ( int groupId = 0; groupId < indexTo; groupId++ ) {

        // collect the positions (in original order) of all calls belonging to this group
        final List<Integer> memberIndexes = new ArrayList<>();
        for ( int pos = 0; pos < originalCalls.size(); pos++ ) {
            final Integer assignedGroup = phaseSetMapping.get(originalCalls.get(pos));
            if ( assignedGroup != null && assignedGroup == groupId )
                memberIndexes.add(pos);
        }

        // a phase group is only meaningful with at least two variants in it
        if ( memberIndexes.size() < 2 )
            throw new IllegalStateException("Somehow we have a group of phased variants that has fewer than 2 members");

        // the shared ID is derived from the leftmost member of the group
        final String uniqueID = createUniqueID(originalCalls.get(memberIndexes.get(0)));

        // replace each member with its phased equivalent
        for ( final int memberIndex : memberIndexes )
            phasedCalls.set(memberIndex, phaseVC(originalCalls.get(memberIndex), uniqueID));
    }

    return phasedCalls;
}
/**
 * Is this variant bi-allelic? This implementation is very much specific to this class so shouldn't be pulled out into a generalized place.
 *
 * @param vc the variant context
 * @return true if this variant context is bi-allelic, ignoring the NON-REF symbolic allele, false otherwise
 */
private static boolean isBiallelic(final VariantContext vc) {
    // a genuinely bi-allelic site qualifies directly
    if ( vc.isBiallelic() )
        return true;
    // otherwise accept a tri-allelic site whose extra allele is the NON-REF symbolic allele
    return vc.getNAlleles() == 3 && vc.getAlternateAlleles().contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
}
/**
 * Create a unique identifier given the variant context
 *
 * @param vc the variant context
 * @return non-null String composed of the call's start position, reference allele, and first alternate allele
 */
private static String createUniqueID(final VariantContext vc) {
    // position plus ref/alt display strings uniquely identify the leftmost variant of a phase group
    return String.format("%d_%s_%s", vc.getStart(), vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString());
}
/**
 * Add physical phase information to the provided variant context
 *
 * @param vc the variant context
 * @param ID the ID to use
 * @return phased non-null variant context
 */
private static VariantContext phaseVC(final VariantContext vc, final String ID) {
    // tag every genotype with the phasing ID, then rebuild the context around the tagged genotypes
    final List<Genotype> taggedGenotypes = new ArrayList<>();
    for ( final Genotype genotype : vc.getGenotypes() ) {
        final Genotype tagged = new GenotypeBuilder(genotype)
                .attribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY, ID)
                .make();
        taggedGenotypes.add(tagged);
    }
    return new VariantContextBuilder(vc).genotypes(taggedGenotypes).make();
}
// Builds the read-likelihoods collection to use for annotation considering user arguments and the collection

View File

@ -57,6 +57,7 @@ import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.haplotype.EventMap;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
@ -365,4 +366,201 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
}
return true;
}
}
// Each case: { variant context to look up, set of called haplotypes, the single haplotype
// expected to carry the variant's alt allele (or null when no haplotype should match) }
@DataProvider(name = "CreateHaplotypeMappingProvider")
public Object[][] makeCreateHaplotypeMappingData() {
List<Object[]> tests = new ArrayList<Object[]>();
final Set<Haplotype> haplotypes = new HashSet<>();
final Allele ref = Allele.create("A", true);
final Allele altC = Allele.create("C", false);
final Allele altT = Allele.create("T", false);
// haplotype whose event map holds vc1 (A->C) at position 3
final Haplotype AtoC1 = new Haplotype("AACAA".getBytes());
final VariantContext vc1 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).make();
AtoC1.setEventMap(new EventMap(Arrays.asList(vc1)));
AtoC1.getEventMap().put(3, vc1);
haplotypes.add(AtoC1);
// haplotype whose event map holds vc2 at position 4
final Haplotype AtoC2 = new Haplotype("AAACA".getBytes());
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altT)).make();
AtoC2.setEventMap(new EventMap(Arrays.asList(vc2)));
AtoC2.getEventMap().put(4, vc2);
haplotypes.add(AtoC2);
tests.add(new Object[]{vc1, haplotypes, AtoC1});
tests.add(new Object[]{vc2, haplotypes, AtoC2});
// a variant at position 1 matches no haplotype event, so the expected haplotype is null
tests.add(new Object[]{new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altT)).make(), haplotypes, null});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider="CreateHaplotypeMappingProvider")
public void testCreateHaplotypeMapping(final VariantContext vc, final Set<Haplotype> haplotypes, final Haplotype expected) {
    // build the mapping for just this one call and inspect its haplotype set
    final Map<VariantContext, Set<Haplotype>> mapping = HaplotypeCallerGenotypingEngine.constructHaplotypeMapping(Arrays.asList(vc), haplotypes);
    final Set<Haplotype> mapped = mapping.get(vc);

    if ( expected != null ) {
        // exactly the one expected haplotype should carry the alt allele
        Assert.assertEquals(mapped.size(), 1);
        Assert.assertEquals(mapped.iterator().next(), expected);
    } else {
        // no haplotype carries the alt allele
        Assert.assertTrue(mapped.isEmpty(), mapped.toString());
    }
}
// Each case: { ordered calls, map from call to the haplotypes carrying it,
// expected number of calls assigned to a phase set, expected number of phase groups }
@DataProvider(name = "ConstructPhaseSetMappingProvider")
public Object[][] makeConstructPhaseSetMappingData() {
List<Object[]> tests = new ArrayList<Object[]>();
final Allele ref = Allele.create("A", true);
final Allele altC = Allele.create("C", false);
final Allele altT = Allele.create("T", false);
// three single-base calls at positions 2, 3, and 4
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altT)).make();
final VariantContext vc4 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altC)).make();
final List<VariantContext> calls = Arrays.asList(vc2, vc3, vc4);
// haplotypes carrying single events...
final Haplotype pos2 = new Haplotype("ACAAA".getBytes());
pos2.setEventMap(new EventMap(Arrays.asList(vc2)));
pos2.getEventMap().put(2, vc2);
final Haplotype pos3 = new Haplotype("AACAA".getBytes());
pos3.setEventMap(new EventMap(Arrays.asList(vc3)));
pos3.getEventMap().put(3, vc3);
final Haplotype pos4 = new Haplotype("AAACA".getBytes());
pos4.setEventMap(new EventMap(Arrays.asList(vc4)));
pos4.getEventMap().put(4, vc4);
// ...a haplotype carrying both vc2 and vc4...
final Haplotype pos24 = new Haplotype("ACACA".getBytes());
pos24.setEventMap(new EventMap(Arrays.asList(vc2, vc4)));
pos24.getEventMap().put(2, vc2);
pos24.getEventMap().put(4, vc4);
// ...and a haplotype carrying all three calls
final Haplotype pos234 = new Haplotype("ACCCA".getBytes());
pos234.setEventMap(new EventMap(Arrays.asList(vc2, vc3, vc4)));
pos234.getEventMap().put(2, vc2);
pos234.getEventMap().put(3, vc3);
pos234.getEventMap().put(4, vc4);
// test no phased variants
final Set<Haplotype> haplotypes2NoPhase = new HashSet<>();
haplotypes2NoPhase.add(pos2);
haplotypes2NoPhase.add(pos24);
final Set<Haplotype> haplotypes3NoPhase = new HashSet<>();
haplotypes3NoPhase.add(pos3);
final Set<Haplotype> haplotypes4NoPhase = new HashSet<>();
haplotypes4NoPhase.add(pos4);
haplotypes4NoPhase.add(pos24);
final Map<VariantContext, Set<Haplotype>> haplotypeMapNoPhase = new HashMap<>();
haplotypeMapNoPhase.put(vc2, haplotypes2NoPhase);
haplotypeMapNoPhase.put(vc3, haplotypes3NoPhase);
haplotypeMapNoPhase.put(vc4, haplotypes4NoPhase);
// no two calls share an identical haplotype set -> no groups, no assignments
tests.add(new Object[]{calls, haplotypeMapNoPhase, 0, 0});
// test 2 phased variants
final Set<Haplotype> haplotypes2SomePhase = new HashSet<>();
haplotypes2SomePhase.add(pos24);
final Set<Haplotype> haplotypes4SomePhase = new HashSet<>();
haplotypes4SomePhase.add(pos24);
final Map<VariantContext, Set<Haplotype>> haplotypeMapSomePhase = new HashMap<>();
haplotypeMapSomePhase.put(vc2, haplotypes2SomePhase);
haplotypeMapSomePhase.put(vc3, haplotypes3NoPhase);
haplotypeMapSomePhase.put(vc4, haplotypes4SomePhase);
// vc2 and vc4 share {pos24} -> one group containing two calls
tests.add(new Object[]{calls, haplotypeMapSomePhase, 2, 1});
// test all phased variants
final Set<Haplotype> haplotypes2AllPhase = new HashSet<>();
haplotypes2AllPhase.add(pos234);
final Set<Haplotype> haplotypes3AllPhase = new HashSet<>();
haplotypes3AllPhase.add(pos234);
final Set<Haplotype> haplotypes4AllPhase = new HashSet<>();
haplotypes4AllPhase.add(pos234);
final Map<VariantContext, Set<Haplotype>> haplotypeMapAllPhase = new HashMap<>();
haplotypeMapAllPhase.put(vc2, haplotypes2AllPhase);
haplotypeMapAllPhase.put(vc3, haplotypes3AllPhase);
haplotypeMapAllPhase.put(vc4, haplotypes4AllPhase);
// all three calls share {pos234} -> one group containing three calls
tests.add(new Object[]{calls, haplotypeMapAllPhase, 3, 1});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider="ConstructPhaseSetMappingProvider")
public void testConstructPhaseSetMapping(final List<VariantContext> calls,
                                         final Map<VariantContext, Set<Haplotype>> haplotypeMap,
                                         final int expectedMapSize,
                                         final int expectedNumGroups) {
    // run the pairwise phasing and check both the group count
    // and how many calls received a phase-set assignment
    final Map<VariantContext, Integer> phaseSets = new HashMap<>();
    final int numGroups = HaplotypeCallerGenotypingEngine.constructPhaseSetMapping(calls, haplotypeMap, phaseSets);
    Assert.assertEquals(numGroups, expectedNumGroups);
    Assert.assertEquals(phaseSets.size(), expectedMapSize);
}
// Each case: { calls, call-to-group map, exclusive end index of group IDs,
// expected number of distinct phasing IDs (-1 means an exception is expected),
// expected number of genotypes annotated with a phasing ID }
@DataProvider(name = "ConstructPhaseGroupsProvider")
public Object[][] makeConstructPhaseGroupsData() {
List<Object[]> tests = new ArrayList<Object[]>();
final Allele ref = Allele.create("A", true);
final Allele altC = Allele.create("C", false);
// three calls at positions 1-3, each with a single het genotype
final Genotype g1 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc1 = new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altC)).genotypes(g1).make();
final Genotype g2 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).genotypes(g2).make();
final Genotype g3 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).genotypes(g3).make();
final List<VariantContext> calls = Arrays.asList(vc1, vc2, vc3);
// test no phased variants, empty map
final Map<VariantContext, Integer> nonePhased1 = new HashMap<>();
tests.add(new Object[]{calls, nonePhased1, 0, 0, 0});
// test no phased variants, full map, exception expected
// (each call sits alone in its own group, which constructPhaseGroups rejects)
final Map<VariantContext, Integer> nonePhased2 = new HashMap<>();
nonePhased2.put(vc1, 0);
nonePhased2.put(vc2, 1);
nonePhased2.put(vc3, 2);
tests.add(new Object[]{calls, nonePhased2, 3, -1, -1});
// test 2 phased variants
final Map<VariantContext, Integer> twoPhased = new HashMap<>();
twoPhased.put(vc1, 0);
twoPhased.put(vc2, 0);
tests.add(new Object[]{calls, twoPhased, 1, 1, 2});
// test all phased variants
final Map<VariantContext, Integer> allPhased = new HashMap<>();
allPhased.put(vc1, 0);
allPhased.put(vc2, 0);
allPhased.put(vc3, 0);
tests.add(new Object[]{calls, allPhased, 1, 1, 3});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider="ConstructPhaseGroupsProvider")
public void testConstructPhaseGroups(final List<VariantContext> calls,
                                     final Map<VariantContext, Integer> phaseMap,
                                     final int endIndex,
                                     final int expectedNumGroups,
                                     final int expectedGroupSize) {
    // an expectedNumGroups of -1 is the sentinel meaning "an IllegalStateException is expected"
    final List<VariantContext> actualPhasedCalls;
    try {
        actualPhasedCalls = HaplotypeCallerGenotypingEngine.constructPhaseGroups(calls, phaseMap, endIndex);
    } catch (IllegalStateException e) {
        // TestNG's assertEquals signature is (actual, expected); the sentinel -1 belongs
        // in the expected slot so failure messages read correctly
        Assert.assertEquals(expectedNumGroups, -1);
        return;
    }

    // collect the distinct phasing IDs and count how many genotypes were annotated
    final Set<String> uniqueGroups = new HashSet<>();
    int counter = 0;
    for ( final VariantContext call : actualPhasedCalls ) {
        for ( final Genotype g : call.getGenotypes() ) {
            if ( g.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY) ) {
                uniqueGroups.add(g.getExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY).toString());
                counter++;
            }
        }
    }

    Assert.assertEquals(uniqueGroups.size(), expectedNumGroups);
    Assert.assertEquals(counter, expectedGroupSize);
}
}

View File

@ -74,7 +74,7 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
* For testing. Let's you set up a explicit configuration without having to process a haplotype and reference
* @param stateForTesting
*/
protected EventMap(final Collection<VariantContext> stateForTesting) {
public EventMap(final Collection<VariantContext> stateForTesting) {
haplotype = null;
ref = null;
refLoc = null;
@ -176,99 +176,8 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
}
}
// handle the case where the event set for the haplotype is very complex
// TODO -- theoretically this should be part of the MergeVariantsAcrossHaplotypes class, but it's just so much more complicated to do so
if ( variationIsTooComplex(proposedEvents) ) {
addComplexVC(cigar, alignment, haplotype.getAlignmentStartHapwrtRef());
} else {
for ( final VariantContext proposedEvent : proposedEvents )
addVC(proposedEvent, true);
}
}
/**
* Determine whether the provided set of variants is too complex for breaking up into individual parts
*
* @param events the individual events
* @return true if the cigar is too complex, false otherwise
*/
private boolean variationIsTooComplex(final List<VariantContext> events) {
// TODO -- we've decided to disable this for now and try "physical phasing"
// Deliberately always false: the indel-count heuristic is kept below (commented out)
// so the old behavior can be restored if physical phasing does not pan out.
return false;
//int indelCount = 0;
//for ( final VariantContext event : events ) {
// if ( event.isIndel() )
// indelCount++;
//}
//
// don't allow too many indels
//return indelCount > MAX_INDELS_PER_HAPLOTYPE;
}
/**
* Add a complex variant context to the events given the haplotype and its cigar
*
* Trims reference-matching bases off both ends of the haplotype and records the remaining
* span as a single block-substitution event via addVC().
*
* @param cigar the cigar to convert
* @param haplotype the bases of the alternate haplotype
* @param refPos the position on the reference for this alignment
*/
private void addComplexVC(final Cigar cigar, final byte[] haplotype, final int refPos) {
// ignore leading and trailing bases that match between this haplotype and the reference
final int matchingPrefix = numPrefixMatch(ref, haplotype, refPos, 0);
final int matchingSuffix = numSuffixMatch(ref, haplotype, refPos + cigar.getReferenceLength() - 1, haplotype.length - 1);
// edge case: too many matches
// NOTE(review): when prefix+suffix trims cover the whole haplotype or reference, the event
// is silently dropped -- confirm that is the intended handling of ref-identical haplotypes
final int totalMatch = matchingPrefix + matchingSuffix;
if ( totalMatch >= haplotype.length || totalMatch >= ref.length )
return;
// the trimmed ref/alt spans become the two alleles of the block substitution
final byte[] refBases = Arrays.copyOfRange(ref, refPos + matchingPrefix, refPos + cigar.getReferenceLength() - matchingSuffix);
final byte[] altBases = Arrays.copyOfRange(haplotype, matchingPrefix, haplotype.length - matchingSuffix);
final List<Allele> alleles = new ArrayList<>();
alleles.add( Allele.create( refBases, true ) );
alleles.add( Allele.create( altBases, false ) );
// genome-coordinate start; the stop is derived from the reference allele length
final int start = refLoc.getStart() + refPos + matchingPrefix;
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), start, start + refBases.length - 1, alleles).make(), true);
}
/**
 * calculates the extent of the prefix match between 2 sequences
 *
 * @param seq1 the first sequence
 * @param seq2 the second sequence
 * @param startPos1 the index on seq1 to start from
 * @param startPos2 the index on seq2 to start from
 * @return non-negative int representing the matching prefix
 */
private int numPrefixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) {
    // walk forward in lock-step until a mismatch or either sequence runs out of bases
    int offset = 0;
    while ( startPos1 + offset < seq1.length
            && startPos2 + offset < seq2.length
            && seq1[startPos1 + offset] == seq2[startPos2 + offset] ) {
        offset++;
    }
    return offset;
}
/**
* calculates the extent of the suffix match between 2 sequences
*
* @param seq1 the first sequence
* @param seq2 the second sequence
* @param startPos1 the index on seq1 to start from
* @param startPos2 the index on seq2 to start from
* @return non-negative int representing the matching suffix
*/
private int numSuffixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) {
// walk backwards from both start positions until a mismatch or either index underflows
int matchingBases = 0;
for ( int pos1 = startPos1, pos2 = startPos2; pos1 >=0 && pos2 >= 0; pos1--, pos2-- ) {
if ( seq1[pos1] != seq2[pos2] )
break;
matchingBases++;
}
return matchingBases;
// NOTE(review): the two lines below appear to be diff-interleaved content belonging to the
// caller's replacement code, not to this method -- confirm against the original file
for ( final VariantContext proposedEvent : proposedEvents )
addVC(proposedEvent, true);
}
/**
@ -345,7 +254,7 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
// TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it
@Requires("getNumberOfEvents() > 0")
protected void replaceClumpedEventsWithBlockSubstititions() {
protected void replaceClumpedEventsWithBlockSubstitutions() {
if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) {
int lastStart = -1;
for ( boolean foundOne = true; foundOne; ) {

View File

@ -130,7 +130,7 @@ public class EventMapUnitTest extends BaseTest {
final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME);
ee.replaceClumpedEventsWithBlockSubstititions();
ee.replaceClumpedEventsWithBlockSubstitutions();
Assert.assertEquals(ee.getNumberOfEvents(), 1);
final VariantContext actual = ee.getVariantContexts().iterator().next();
Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual);
@ -159,7 +159,7 @@ public class EventMapUnitTest extends BaseTest {
final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME);
ee.replaceClumpedEventsWithBlockSubstititions();
ee.replaceClumpedEventsWithBlockSubstitutions();
Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size());
final List<VariantContext> actuals = new ArrayList<VariantContext>(ee.getVariantContexts());
for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) {