Merge pull request #700 from broadinstitute/eb_phase_HC_variants_PT74816060
Initial implementation of functionality to add physical phasing informat...
This commit is contained in:
commit
27193c5048
|
|
@ -496,6 +496,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
@Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false)
|
||||
protected boolean mergeVariantsViaLD = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="tryPhysicalPhasing", shortName="tryPhysicalPhasing", doc="If specified, we will add physical (read-based) phasing information", required = false)
|
||||
protected boolean tryPhysicalPhasing = false;
|
||||
|
||||
public static final String HAPLOTYPE_CALLER_PHASING_KEY = "HCP";
|
||||
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
// arguments for debugging / developing the haplotype caller
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
|
|
@ -672,7 +678,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && consensusMode )
|
||||
throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. Please choose one or the other.");
|
||||
|
||||
genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(),SCAC);
|
||||
genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(), SCAC, tryPhysicalPhasing);
|
||||
// initialize the output VCF header
|
||||
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
||||
|
|
@ -693,6 +699,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
VCFConstants.DEPTH_KEY,
|
||||
VCFConstants.GENOTYPE_PL_KEY);
|
||||
|
||||
if ( tryPhysicalPhasing )
|
||||
headerInfo.add(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Physical phasing information, each unique ID within a given sample (but not across samples) connects alternate alleles as occurring on the same haplotype"));
|
||||
|
||||
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
|
||||
// where the filters are used. For example, in emitting all sites the lowQual field is used
|
||||
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
|
||||
|
|
|
|||
|
|
@ -77,13 +77,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
|
|||
|
||||
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
|
||||
|
||||
private final boolean tryPhysicalPhasing;
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* @param toolkit {@inheritDoc}
|
||||
* @param configuration {@inheritDoc}
|
||||
*/
|
||||
public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration) {
|
||||
public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration, final boolean tryPhysicalPhasing) {
|
||||
super(toolkit,configuration);
|
||||
this.tryPhysicalPhasing = tryPhysicalPhasing;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -94,6 +97,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
|
|||
*/
|
||||
public HaplotypeCallerGenotypingEngine(final GenomeAnalysisEngine toolkit, final HaplotypeCallerArgumentCollection configuration, final Set<String> sampleNames) {
|
||||
super(toolkit,configuration,sampleNames);
|
||||
tryPhysicalPhasing = false;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -282,7 +286,179 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
|
|||
}
|
||||
}
|
||||
|
||||
return new CalledHaplotypes(returnCalls, calledHaplotypes);
|
||||
final List<VariantContext> phasedCalls = tryPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
|
||||
return new CalledHaplotypes(phasedCalls, calledHaplotypes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to phase the individual alleles based on pairwise comparisons to the other alleles based on all called haplotypes
|
||||
*
|
||||
* @param calls the list of called alleles
|
||||
* @param calledHaplotypes the set of haplotypes used for calling
|
||||
* @return a non-null list which represents the possibly phased version of the calls
|
||||
*/
|
||||
protected List<VariantContext> phaseCalls(final List<VariantContext> calls, final Set<Haplotype> calledHaplotypes) {
|
||||
|
||||
// construct a mapping from alternate allele to the set of haplotypes that contain that allele
|
||||
final Map<VariantContext, Set<Haplotype>> haplotypeMap = constructHaplotypeMapping(calls, calledHaplotypes);
|
||||
|
||||
// construct a mapping from call to phase set ID
|
||||
final Map<VariantContext, Integer> phaseSetMapping = new HashMap<>();
|
||||
final int uniqueCounterEndValue = constructPhaseSetMapping(calls, haplotypeMap, phaseSetMapping);
|
||||
|
||||
// we want to establish (potential) *groups* of phased variants, so we need to be smart when looking at pairwise phasing partners
|
||||
return constructPhaseGroups(calls, phaseSetMapping, uniqueCounterEndValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct the mapping from alternate allele to the set of haplotypes that contain that allele
|
||||
*
|
||||
* @param originalCalls the original unphased calls
|
||||
* @param calledHaplotypes the set of haplotypes used for calling
|
||||
* @return non-null Map
|
||||
*/
|
||||
protected static Map<VariantContext, Set<Haplotype>> constructHaplotypeMapping(final List<VariantContext> originalCalls,
|
||||
final Set<Haplotype> calledHaplotypes) {
|
||||
final Map<VariantContext, Set<Haplotype>> haplotypeMap = new HashMap<>(originalCalls.size());
|
||||
for ( final VariantContext call : originalCalls ) {
|
||||
// don't try to phase if there is not exactly 1 alternate allele
|
||||
if ( ! isBiallelic(call) ) {
|
||||
haplotypeMap.put(call, Collections.<Haplotype>emptySet());
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep track of the haplotypes that contain this particular alternate allele
|
||||
final Set<Haplotype> hapsWithAllele = new HashSet<>();
|
||||
final Allele alt = call.getAlternateAllele(0);
|
||||
|
||||
for ( final Haplotype h : calledHaplotypes ) {
|
||||
for ( final VariantContext event : h.getEventMap().getVariantContexts() ) {
|
||||
if ( event.getStart() == call.getStart() && event.getAlternateAlleles().contains(alt) )
|
||||
hapsWithAllele.add(h);
|
||||
}
|
||||
}
|
||||
haplotypeMap.put(call, hapsWithAllele);
|
||||
}
|
||||
|
||||
return haplotypeMap;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Construct the mapping from call (variant context) to phase set ID
|
||||
*
|
||||
* @param originalCalls the original unphased calls
|
||||
* @param haplotypeMap mapping from alternate allele to the set of haplotypes that contain that allele
|
||||
* @param phaseSetMapping the map to populate in this method
|
||||
* @return the next incremental unique index
|
||||
*/
|
||||
protected static int constructPhaseSetMapping(final List<VariantContext> originalCalls,
|
||||
final Map<VariantContext, Set<Haplotype>> haplotypeMap,
|
||||
final Map<VariantContext, Integer> phaseSetMapping) {
|
||||
|
||||
final int numCalls = originalCalls.size();
|
||||
int uniqueCounter = 0;
|
||||
|
||||
// use the haplotype mapping to connect variants that are always present on the same haplotypes
|
||||
for ( int i = 0; i < numCalls - 1; i++ ) {
|
||||
final VariantContext call = originalCalls.get(i);
|
||||
final Set<Haplotype> haplotypesWithCall = haplotypeMap.get(call);
|
||||
if ( haplotypesWithCall.isEmpty() )
|
||||
continue;
|
||||
|
||||
for ( int j = i+1; j < numCalls; j++ ) {
|
||||
final VariantContext comp = originalCalls.get(j);
|
||||
final Set<Haplotype> haplotypesWithComp = haplotypeMap.get(comp);
|
||||
|
||||
// if the variants are in phase, record that fact
|
||||
if ( haplotypesWithCall.size() == haplotypesWithComp.size() && haplotypesWithCall.containsAll(haplotypesWithComp) ) {
|
||||
// if it's part of an existing group, use that group's unique ID
|
||||
if ( phaseSetMapping.containsKey(call) ) {
|
||||
phaseSetMapping.put(comp, phaseSetMapping.get(call));
|
||||
// otherwise, create a new group
|
||||
} else {
|
||||
phaseSetMapping.put(call, uniqueCounter);
|
||||
phaseSetMapping.put(comp, uniqueCounter);
|
||||
uniqueCounter++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return uniqueCounter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assemble the phase groups together and update the original calls accordingly
|
||||
*
|
||||
* @param originalCalls the original unphased calls
|
||||
* @param phaseSetMapping mapping from call (variant context) to phase group ID
|
||||
* @param indexTo last index (exclusive) of phase group IDs
|
||||
* @return a non-null list which represents the possibly phased version of the calls
|
||||
*/
|
||||
protected static List<VariantContext> constructPhaseGroups(final List<VariantContext> originalCalls,
|
||||
final Map<VariantContext, Integer> phaseSetMapping,
|
||||
final int indexTo) {
|
||||
final List<VariantContext> phasedCalls = new ArrayList<>(originalCalls);
|
||||
|
||||
// if we managed to find any phased groups, update the VariantContexts
|
||||
for ( int count = 0; count < indexTo; count++ ) {
|
||||
// get all of the (indexes of the) calls that belong in this group (keeping them in the original order)
|
||||
final List<Integer> indexes = new ArrayList<>();
|
||||
for ( int index = 0; index < originalCalls.size(); index++ ) {
|
||||
final VariantContext call = originalCalls.get(index);
|
||||
if ( phaseSetMapping.containsKey(call) && phaseSetMapping.get(call) == count )
|
||||
indexes.add(index);
|
||||
}
|
||||
if ( indexes.size() < 2 )
|
||||
throw new IllegalStateException("Somehow we have a group of phased variants that has fewer than 2 members");
|
||||
|
||||
// create a unique ID based on the leftmost one
|
||||
final String uniqueID = createUniqueID(originalCalls.get(indexes.get(0)));
|
||||
|
||||
// update the VCs
|
||||
for ( final int index : indexes ) {
|
||||
final VariantContext phasedCall = phaseVC(originalCalls.get(index), uniqueID);
|
||||
phasedCalls.set(index, phasedCall);
|
||||
}
|
||||
}
|
||||
|
||||
return phasedCalls;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this variant bi-allelic? This implementation is very much specific to this class so shouldn't be pulled out into a generalized place.
|
||||
*
|
||||
* @param vc the variant context
|
||||
* @return true if this variant context is bi-allelic, ignoring the NON-REF symbolic allele, false otherwise
|
||||
*/
|
||||
private static boolean isBiallelic(final VariantContext vc) {
|
||||
return vc.isBiallelic() || (vc.getNAlleles() == 3 && vc.getAlternateAlleles().contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a unique identifier given the variant context
|
||||
*
|
||||
* @param vc the variant context
|
||||
* @return non-null String
|
||||
*/
|
||||
private static String createUniqueID(final VariantContext vc) {
|
||||
return String.format("%d_%s_%s", vc.getStart(), vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString());
|
||||
// return base + "_0," + base + "_1";
|
||||
}
|
||||
|
||||
/**
|
||||
* Add physical phase information to the provided variant context
|
||||
*
|
||||
* @param vc the variant context
|
||||
* @param ID the ID to use
|
||||
* @return phased non-null variant context
|
||||
*/
|
||||
private static VariantContext phaseVC(final VariantContext vc, final String ID) {
|
||||
final List<Genotype> phasedGenotypes = new ArrayList<>();
|
||||
for ( final Genotype g : vc.getGenotypes() )
|
||||
phasedGenotypes.add(new GenotypeBuilder(g).attribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY, ID).make());
|
||||
return new VariantContextBuilder(vc).genotypes(phasedGenotypes).make();
|
||||
}
|
||||
|
||||
// Builds the read-likelihoods collection to use for annotation considering user arguments and the collection
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ import htsjdk.variant.variantcontext.*;
|
|||
import org.broadinstitute.gatk.utils.BaseTest;
|
||||
import org.broadinstitute.gatk.utils.*;
|
||||
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.gatk.utils.haplotype.EventMap;
|
||||
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
|
||||
|
|
@ -365,4 +366,201 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@DataProvider(name = "CreateHaplotypeMappingProvider")
|
||||
public Object[][] makeCreateHaplotypeMappingData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final Set<Haplotype> haplotypes = new HashSet<>();
|
||||
final Allele ref = Allele.create("A", true);
|
||||
final Allele altC = Allele.create("C", false);
|
||||
final Allele altT = Allele.create("T", false);
|
||||
|
||||
final Haplotype AtoC1 = new Haplotype("AACAA".getBytes());
|
||||
final VariantContext vc1 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).make();
|
||||
AtoC1.setEventMap(new EventMap(Arrays.asList(vc1)));
|
||||
AtoC1.getEventMap().put(3, vc1);
|
||||
haplotypes.add(AtoC1);
|
||||
|
||||
final Haplotype AtoC2 = new Haplotype("AAACA".getBytes());
|
||||
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altT)).make();
|
||||
AtoC2.setEventMap(new EventMap(Arrays.asList(vc2)));
|
||||
AtoC2.getEventMap().put(4, vc2);
|
||||
haplotypes.add(AtoC2);
|
||||
|
||||
tests.add(new Object[]{vc1, haplotypes, AtoC1});
|
||||
tests.add(new Object[]{vc2, haplotypes, AtoC2});
|
||||
tests.add(new Object[]{new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altT)).make(), haplotypes, null});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider="CreateHaplotypeMappingProvider")
|
||||
public void testCreateHaplotypeMapping(final VariantContext vc, final Set<Haplotype> haplotypes, final Haplotype expected) {
|
||||
final Map<VariantContext, Set<Haplotype>> mapping = HaplotypeCallerGenotypingEngine.constructHaplotypeMapping(Arrays.asList(vc), haplotypes);
|
||||
final Set<Haplotype> actual = mapping.get(vc);
|
||||
if ( expected == null )
|
||||
Assert.assertTrue(actual.isEmpty(), actual.toString());
|
||||
else {
|
||||
Assert.assertEquals(actual.size(), 1);
|
||||
Assert.assertEquals(actual.iterator().next(), expected);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "ConstructPhaseSetMappingProvider")
|
||||
public Object[][] makeConstructPhaseSetMappingData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final Allele ref = Allele.create("A", true);
|
||||
final Allele altC = Allele.create("C", false);
|
||||
final Allele altT = Allele.create("T", false);
|
||||
|
||||
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).make();
|
||||
final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altT)).make();
|
||||
final VariantContext vc4 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altC)).make();
|
||||
final List<VariantContext> calls = Arrays.asList(vc2, vc3, vc4);
|
||||
|
||||
final Haplotype pos2 = new Haplotype("ACAAA".getBytes());
|
||||
pos2.setEventMap(new EventMap(Arrays.asList(vc2)));
|
||||
pos2.getEventMap().put(2, vc2);
|
||||
final Haplotype pos3 = new Haplotype("AACAA".getBytes());
|
||||
pos3.setEventMap(new EventMap(Arrays.asList(vc3)));
|
||||
pos3.getEventMap().put(3, vc3);
|
||||
final Haplotype pos4 = new Haplotype("AAACA".getBytes());
|
||||
pos4.setEventMap(new EventMap(Arrays.asList(vc4)));
|
||||
pos4.getEventMap().put(4, vc4);
|
||||
final Haplotype pos24 = new Haplotype("ACACA".getBytes());
|
||||
pos24.setEventMap(new EventMap(Arrays.asList(vc2, vc4)));
|
||||
pos24.getEventMap().put(2, vc2);
|
||||
pos24.getEventMap().put(4, vc4);
|
||||
final Haplotype pos234 = new Haplotype("ACCCA".getBytes());
|
||||
pos234.setEventMap(new EventMap(Arrays.asList(vc2, vc3, vc4)));
|
||||
pos234.getEventMap().put(2, vc2);
|
||||
pos234.getEventMap().put(3, vc3);
|
||||
pos234.getEventMap().put(4, vc4);
|
||||
|
||||
// test no phased variants
|
||||
final Set<Haplotype> haplotypes2NoPhase = new HashSet<>();
|
||||
haplotypes2NoPhase.add(pos2);
|
||||
haplotypes2NoPhase.add(pos24);
|
||||
final Set<Haplotype> haplotypes3NoPhase = new HashSet<>();
|
||||
haplotypes3NoPhase.add(pos3);
|
||||
final Set<Haplotype> haplotypes4NoPhase = new HashSet<>();
|
||||
haplotypes4NoPhase.add(pos4);
|
||||
haplotypes4NoPhase.add(pos24);
|
||||
final Map<VariantContext, Set<Haplotype>> haplotypeMapNoPhase = new HashMap<>();
|
||||
haplotypeMapNoPhase.put(vc2, haplotypes2NoPhase);
|
||||
haplotypeMapNoPhase.put(vc3, haplotypes3NoPhase);
|
||||
haplotypeMapNoPhase.put(vc4, haplotypes4NoPhase);
|
||||
tests.add(new Object[]{calls, haplotypeMapNoPhase, 0, 0});
|
||||
|
||||
// test 2 phased variants
|
||||
final Set<Haplotype> haplotypes2SomePhase = new HashSet<>();
|
||||
haplotypes2SomePhase.add(pos24);
|
||||
final Set<Haplotype> haplotypes4SomePhase = new HashSet<>();
|
||||
haplotypes4SomePhase.add(pos24);
|
||||
final Map<VariantContext, Set<Haplotype>> haplotypeMapSomePhase = new HashMap<>();
|
||||
haplotypeMapSomePhase.put(vc2, haplotypes2SomePhase);
|
||||
haplotypeMapSomePhase.put(vc3, haplotypes3NoPhase);
|
||||
haplotypeMapSomePhase.put(vc4, haplotypes4SomePhase);
|
||||
tests.add(new Object[]{calls, haplotypeMapSomePhase, 2, 1});
|
||||
|
||||
// test all phased variants
|
||||
final Set<Haplotype> haplotypes2AllPhase = new HashSet<>();
|
||||
haplotypes2AllPhase.add(pos234);
|
||||
final Set<Haplotype> haplotypes3AllPhase = new HashSet<>();
|
||||
haplotypes3AllPhase.add(pos234);
|
||||
final Set<Haplotype> haplotypes4AllPhase = new HashSet<>();
|
||||
haplotypes4AllPhase.add(pos234);
|
||||
final Map<VariantContext, Set<Haplotype>> haplotypeMapAllPhase = new HashMap<>();
|
||||
haplotypeMapAllPhase.put(vc2, haplotypes2AllPhase);
|
||||
haplotypeMapAllPhase.put(vc3, haplotypes3AllPhase);
|
||||
haplotypeMapAllPhase.put(vc4, haplotypes4AllPhase);
|
||||
tests.add(new Object[]{calls, haplotypeMapAllPhase, 3, 1});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider="ConstructPhaseSetMappingProvider")
|
||||
public void testConstructPhaseSetMapping(final List<VariantContext> calls,
|
||||
final Map<VariantContext, Set<Haplotype>> haplotypeMap,
|
||||
final int expectedMapSize,
|
||||
final int expectedNumGroups) {
|
||||
final Map<VariantContext, Integer> actualPhaseSetMapping = new HashMap<>();
|
||||
final int actualNumGroups = HaplotypeCallerGenotypingEngine.constructPhaseSetMapping(calls, haplotypeMap, actualPhaseSetMapping);
|
||||
Assert.assertEquals(actualNumGroups, expectedNumGroups);
|
||||
Assert.assertEquals(actualPhaseSetMapping.size(), expectedMapSize);
|
||||
}
|
||||
|
||||
@DataProvider(name = "ConstructPhaseGroupsProvider")
|
||||
public Object[][] makeConstructPhaseGroupsData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final Allele ref = Allele.create("A", true);
|
||||
final Allele altC = Allele.create("C", false);
|
||||
|
||||
final Genotype g1 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make();
|
||||
final VariantContext vc1 = new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altC)).genotypes(g1).make();
|
||||
final Genotype g2 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make();
|
||||
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).genotypes(g2).make();
|
||||
final Genotype g3 = new GenotypeBuilder().alleles(Arrays.asList(ref, altC)).make();
|
||||
final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).genotypes(g3).make();
|
||||
final List<VariantContext> calls = Arrays.asList(vc1, vc2, vc3);
|
||||
|
||||
// test no phased variants, empty map
|
||||
final Map<VariantContext, Integer> nonePhased1 = new HashMap<>();
|
||||
tests.add(new Object[]{calls, nonePhased1, 0, 0, 0});
|
||||
|
||||
// test no phased variants, full map, exception expected
|
||||
final Map<VariantContext, Integer> nonePhased2 = new HashMap<>();
|
||||
nonePhased2.put(vc1, 0);
|
||||
nonePhased2.put(vc2, 1);
|
||||
nonePhased2.put(vc3, 2);
|
||||
tests.add(new Object[]{calls, nonePhased2, 3, -1, -1});
|
||||
|
||||
// test 2 phased variants
|
||||
final Map<VariantContext, Integer> twoPhased = new HashMap<>();
|
||||
twoPhased.put(vc1, 0);
|
||||
twoPhased.put(vc2, 0);
|
||||
tests.add(new Object[]{calls, twoPhased, 1, 1, 2});
|
||||
|
||||
// test all phased variants
|
||||
final Map<VariantContext, Integer> allPhased = new HashMap<>();
|
||||
allPhased.put(vc1, 0);
|
||||
allPhased.put(vc2, 0);
|
||||
allPhased.put(vc3, 0);
|
||||
tests.add(new Object[]{calls, allPhased, 1, 1, 3});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider="ConstructPhaseGroupsProvider")
|
||||
public void testConstructPhaseGroups(final List<VariantContext> calls,
|
||||
final Map<VariantContext, Integer> phaseMap,
|
||||
final int endIndex,
|
||||
final int expectedNumGroups,
|
||||
final int expectedGroupSize) {
|
||||
final List<VariantContext> actualPhasedCalls;
|
||||
try {
|
||||
actualPhasedCalls = HaplotypeCallerGenotypingEngine.constructPhaseGroups(calls, phaseMap, endIndex);
|
||||
} catch (IllegalStateException e) {
|
||||
Assert.assertEquals(-1, expectedNumGroups);
|
||||
return;
|
||||
}
|
||||
|
||||
final Set<String> uniqueGroups = new HashSet<>();
|
||||
int counter = 0;
|
||||
for ( final VariantContext call : actualPhasedCalls ) {
|
||||
for ( final Genotype g : call.getGenotypes() ) {
|
||||
if ( g.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY) ) {
|
||||
uniqueGroups.add(g.getExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_KEY).toString());
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals(uniqueGroups.size(), expectedNumGroups);
|
||||
Assert.assertEquals(counter, expectedGroupSize);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
|
|||
* For testing. Let's you set up a explicit configuration without having to process a haplotype and reference
|
||||
* @param stateForTesting
|
||||
*/
|
||||
protected EventMap(final Collection<VariantContext> stateForTesting) {
|
||||
public EventMap(final Collection<VariantContext> stateForTesting) {
|
||||
haplotype = null;
|
||||
ref = null;
|
||||
refLoc = null;
|
||||
|
|
@ -176,99 +176,8 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
|
|||
}
|
||||
}
|
||||
|
||||
// handle the case where the event set for the haplotype is very complex
|
||||
// TODO -- theoretically this should be part of the MergeVariantsAcrossHaplotypes class, but it's just so much more complicated to do so
|
||||
if ( variationIsTooComplex(proposedEvents) ) {
|
||||
addComplexVC(cigar, alignment, haplotype.getAlignmentStartHapwrtRef());
|
||||
} else {
|
||||
for ( final VariantContext proposedEvent : proposedEvents )
|
||||
addVC(proposedEvent, true);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether the provided set of variants is too complex for breaking up into individual parts
|
||||
*
|
||||
* @param events the individual events
|
||||
* @return true if the cigar is too complex, false otherwise
|
||||
*/
|
||||
private boolean variationIsTooComplex(final List<VariantContext> events) {
|
||||
// TODO -- we've decided to disable this for now and try "physical phasing"
|
||||
return false;
|
||||
|
||||
//int indelCount = 0;
|
||||
//for ( final VariantContext event : events ) {
|
||||
// if ( event.isIndel() )
|
||||
// indelCount++;
|
||||
//}
|
||||
//
|
||||
// don't allow too many indels
|
||||
//return indelCount > MAX_INDELS_PER_HAPLOTYPE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a complex variant context to the events given the haplotype and its cigar
|
||||
*
|
||||
* @param cigar the cigar to convert
|
||||
* @param haplotype the bases of the alternate haplotype
|
||||
* @param refPos the position on the reference for this alignment
|
||||
*/
|
||||
private void addComplexVC(final Cigar cigar, final byte[] haplotype, final int refPos) {
|
||||
// ignore leading and trailing bases that match between this haplotype and the reference
|
||||
final int matchingPrefix = numPrefixMatch(ref, haplotype, refPos, 0);
|
||||
final int matchingSuffix = numSuffixMatch(ref, haplotype, refPos + cigar.getReferenceLength() - 1, haplotype.length - 1);
|
||||
|
||||
// edge case: too many matches
|
||||
final int totalMatch = matchingPrefix + matchingSuffix;
|
||||
if ( totalMatch >= haplotype.length || totalMatch >= ref.length )
|
||||
return;
|
||||
|
||||
final byte[] refBases = Arrays.copyOfRange(ref, refPos + matchingPrefix, refPos + cigar.getReferenceLength() - matchingSuffix);
|
||||
final byte[] altBases = Arrays.copyOfRange(haplotype, matchingPrefix, haplotype.length - matchingSuffix);
|
||||
|
||||
final List<Allele> alleles = new ArrayList<>();
|
||||
alleles.add( Allele.create( refBases, true ) );
|
||||
alleles.add( Allele.create( altBases, false ) );
|
||||
final int start = refLoc.getStart() + refPos + matchingPrefix;
|
||||
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), start, start + refBases.length - 1, alleles).make(), true);
|
||||
}
|
||||
|
||||
/**
|
||||
* calculates the extent of the prefix match between 2 sequences
|
||||
*
|
||||
* @param seq1 the first sequence
|
||||
* @param seq2 the second sequence
|
||||
* @param startPos1 the index on seq1 to start from
|
||||
* @param startPos2 the index on seq2 to start from
|
||||
* @return non-negative int representing the matching prefix
|
||||
*/
|
||||
private int numPrefixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) {
|
||||
int matchingBases = 0;
|
||||
for ( int pos1 = startPos1, pos2 = startPos2; pos1 < seq1.length && pos2 < seq2.length; pos1++, pos2++ ) {
|
||||
if ( seq1[pos1] != seq2[pos2] )
|
||||
break;
|
||||
matchingBases++;
|
||||
}
|
||||
return matchingBases;
|
||||
}
|
||||
|
||||
/**
|
||||
* calculates the extent of the suffix match between 2 sequences
|
||||
*
|
||||
* @param seq1 the first sequence
|
||||
* @param seq2 the second sequence
|
||||
* @param startPos1 the index on seq1 to start from
|
||||
* @param startPos2 the index on seq2 to start from
|
||||
* @return non-negative int representing the matching suffix
|
||||
*/
|
||||
private int numSuffixMatch(final byte[] seq1, final byte[] seq2, final int startPos1, final int startPos2) {
|
||||
int matchingBases = 0;
|
||||
for ( int pos1 = startPos1, pos2 = startPos2; pos1 >=0 && pos2 >= 0; pos1--, pos2-- ) {
|
||||
if ( seq1[pos1] != seq2[pos2] )
|
||||
break;
|
||||
matchingBases++;
|
||||
}
|
||||
return matchingBases;
|
||||
for ( final VariantContext proposedEvent : proposedEvents )
|
||||
addVC(proposedEvent, true);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -345,7 +254,7 @@ public class EventMap extends TreeMap<Integer, VariantContext> {
|
|||
|
||||
// TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it
|
||||
@Requires("getNumberOfEvents() > 0")
|
||||
protected void replaceClumpedEventsWithBlockSubstititions() {
|
||||
protected void replaceClumpedEventsWithBlockSubstitutions() {
|
||||
if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) {
|
||||
int lastStart = -1;
|
||||
for ( boolean foundOne = true; foundOne; ) {
|
||||
|
|
|
|||
|
|
@ -130,7 +130,7 @@ public class EventMapUnitTest extends BaseTest {
|
|||
final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
|
||||
final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
|
||||
final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME);
|
||||
ee.replaceClumpedEventsWithBlockSubstititions();
|
||||
ee.replaceClumpedEventsWithBlockSubstitutions();
|
||||
Assert.assertEquals(ee.getNumberOfEvents(), 1);
|
||||
final VariantContext actual = ee.getVariantContexts().iterator().next();
|
||||
Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual);
|
||||
|
|
@ -159,7 +159,7 @@ public class EventMapUnitTest extends BaseTest {
|
|||
final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
|
||||
final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
|
||||
final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME);
|
||||
ee.replaceClumpedEventsWithBlockSubstititions();
|
||||
ee.replaceClumpedEventsWithBlockSubstitutions();
|
||||
Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size());
|
||||
final List<VariantContext> actuals = new ArrayList<VariantContext>(ee.getVariantContexts());
|
||||
for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue