Lots more GGA fixes for the HC now that I understand what's going on internally. Integration tests pass except for the GGA test which I believe now produces better results.

This commit is contained in:
Eric Banks 2012-11-20 16:13:29 -05:00
parent 4f243acaa6
commit 937ac7290f
6 changed files with 87 additions and 43 deletions

View File

@ -62,6 +62,7 @@ public class GenotypingEngine {
final List<VariantContext> activeAllelesToGenotype ) {
final ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>>();
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
@ -70,7 +71,7 @@ public class GenotypingEngine {
for( final Haplotype h : haplotypes ) {
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
if( DEBUG ) {
System.out.println( h.toString() );
System.out.println( "> Cigar = " + h.getCigar() );
@ -80,10 +81,10 @@ public class GenotypingEngine {
}
cleanUpSymbolicUnassembledEvents( haplotypes );
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
}
if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
if( in_GGA_mode ) {
for( final VariantContext compVC : activeAllelesToGenotype ) {
startPosKeySet.add( compVC.getStart() );
}
@ -95,7 +96,7 @@ public class GenotypingEngine {
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
if( activeAllelesToGenotype.isEmpty() ) {
if( !in_GGA_mode ) {
for( final Haplotype h : haplotypes ) {
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc);
@ -129,9 +130,8 @@ public class GenotypingEngine {
final Allele refAllele = eventsAtThisLoc.get(0).getReference();
final ArrayList<Allele> alleleOrdering = new ArrayList<Allele>(alleleMapper.size());
alleleOrdering.add(refAllele);
for ( final Allele allele : alleleMapper.keySet() ) {
if ( !refAllele.equals(allele) )
alleleOrdering.add(allele);
for( final VariantContext vc : eventsAtThisLoc ) {
alleleOrdering.add(vc.getAlternateAllele(0));
}
// Sanity check the priority list
@ -154,6 +154,16 @@ public class GenotypingEngine {
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
// let's update the Allele keys in the mapper because they can change after merging when there are complex events
Map<Allele, List<Haplotype>> updatedAlleleMapper = new HashMap<Allele, List<Haplotype>>(alleleMapper.size());
for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) {
final Allele oldAllele = alleleOrdering.get(i);
final Allele newAllele = mergedVC.getAlleles().get(i);
updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele));
alleleOrdering.set(i, newAllele);
}
alleleMapper = updatedAlleleMapper;
if( DEBUG ) {
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
@ -358,48 +368,48 @@ public class GenotypingEngine {
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes ) {
final Allele refAllele = eventsAtThisLoc.get(0).getReference();
final Map<Allele, List<Haplotype>> alleleMapper = new HashMap<Allele, List<Haplotype>>(eventsAtThisLoc.size()+1);
final Allele refAllele = eventsAtThisLoc.get(0).getReference();
alleleMapper.put(refAllele, new ArrayList<Haplotype>());
for( final VariantContext vc : eventsAtThisLoc )
alleleMapper.put(vc.getAlternateAllele(0), new ArrayList<Haplotype>());
final ArrayList<Haplotype> undeterminedHaplotypes = new ArrayList<Haplotype>(haplotypes.size());
for( final Haplotype h : haplotypes ) {
if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype
if ( !alleleMapper.containsKey(refAllele) )
alleleMapper.put(refAllele, new ArrayList<Haplotype>());
alleleMapper.get(refAllele).add(h);
} else if ( h.isArtificialHaplotype() ) {
if ( !alleleMapper.containsKey(h.getArtificialAllele()) )
alleleMapper.put(h.getArtificialAllele(), new ArrayList<Haplotype>());
} else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) {
alleleMapper.get(h.getArtificialAllele()).add(h);
} else {
boolean haplotypeIsDetermined = false;
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
final Allele altAllele = vcAtThisLoc.getAlternateAllele(0);
if ( !alleleMapper.containsKey(altAllele) )
alleleMapper.put(altAllele, new ArrayList<Haplotype>());
alleleMapper.get(altAllele).add(h);
alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h);
haplotypeIsDetermined = true;
break;
}
}
if( !haplotypeIsDetermined )
undeterminedHaplotypes.add(h);
}
}
for( final Haplotype h : haplotypes ) {
if ( h.getEventMap().get(loc) == null || h.isArtificialHaplotype() )
continue;
for( final Haplotype h : undeterminedHaplotypes ) {
Allele matchingAllele = null;
for ( final Map.Entry<Allele, List<Haplotype>> alleleToTest : alleleMapper.entrySet() ) {
if ( alleleToTest.getKey().equals(refAllele) )
for( final Map.Entry<Allele, List<Haplotype>> alleleToTest : alleleMapper.entrySet() ) {
// don't test against the reference allele
if( alleleToTest.getKey().equals(refAllele) )
continue;
final Haplotype artificialHaplotype = alleleToTest.getValue().get(0);
if ( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap()) ) {
if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) {
matchingAllele = alleleToTest.getKey();
break;
}
}
if ( matchingAllele == null )
if( matchingAllele == null )
matchingAllele = refAllele;
alleleMapper.get(matchingAllele).add(h);
}
@ -407,20 +417,36 @@ public class GenotypingEngine {
return alleleMapper;
}
protected static boolean isSubSetOf(final Map<Integer, VariantContext> subset, final Map<Integer, VariantContext> superset) {
protected static boolean isSubSetOf(final Map<Integer, VariantContext> subset, final Map<Integer, VariantContext> superset, final boolean resolveSupersetToSubset) {
for ( final Map.Entry<Integer, VariantContext> fromSubset : subset.entrySet() ) {
final VariantContext fromSuperset = superset.get(fromSubset.getKey());
if ( fromSuperset == null )
return false;
if ( !fromSuperset.hasAlternateAllele(fromSubset.getValue().getAlternateAllele(0)) )
List<Allele> supersetAlleles = fromSuperset.getAlternateAlleles();
if ( resolveSupersetToSubset )
supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles);
if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) )
return false;
}
return true;
}
private static List<Allele> resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List<Allele> currentAlleles) {
if ( targetReference.length() <= actualReference.length() )
return currentAlleles;
final List<Allele> newAlleles = new ArrayList<Allele>(currentAlleles.size());
final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length());
for ( final Allele a : currentAlleles ) {
newAlleles.add(Allele.extend(a, extraBases));
}
return newAlleles;
}
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final ArrayList<ArrayList<Haplotype>> alleleMapper, final ArrayList<Haplotype> haplotypes ) {
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }

View File

@ -196,8 +196,8 @@ public class LikelihoodCalculationEngine {
}
// This function takes a set of samples to pool over and a haplotypeMapping
@Requires({"haplotypeMapping.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
@Requires({"haplotypeList.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final List<Haplotype> haplotypeList ) {
final int numHaplotypes = haplotypeList.size();

View File

@ -278,9 +278,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype
// for GGA mode, add the desired allele into the haplotype
for( final VariantContext compVC : activeAllelesToGenotype ) {
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart());
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
return returnHaplotypes;
//throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype);
@ -290,15 +291,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() );
if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
// for GGA mode, add the desired allele into the haplotype if it isn't already present
if( !activeAllelesToGenotype.isEmpty() ) {
final HashMap<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) {
// This if statement used to additionally have:
// "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
// but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
// a haplotype that already contains a 1bp insertion (so practically it is reference but
// falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
if( vcOnHaplotype == null ) {
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
}
}
}
@ -370,7 +380,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) );
if ( haplotype.isArtificialHaplotype() )
h.setArtificialAllele(haplotype.getArtificialAllele());
h.setArtificialAllele(haplotype.getArtificialAllele(), haplotype.getArtificialAllelePosition());
h.leftBreakPoint = leftBreakPoint;
h.rightBreakPoint = rightBreakPoint;
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures

View File

@ -29,9 +29,10 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f");
}
// TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "39da622b309597d7a0b082c8aa1748c9");
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "f2d0309fdf50d5827e9c60ed0dd07e3f");
}
private void HCTestComplexVariants(String bam, String args, String md5) {

View File

@ -50,7 +50,8 @@ public class Haplotype {
public int leftBreakPoint = 0;
public int rightBreakPoint = 0;
private Allele artificialAllele = null;
private int artificialAllelePosition = -1;
/**
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
*
@ -72,9 +73,10 @@ public class Haplotype {
this(bases, 0);
}
public Haplotype( final byte[] bases, final Allele artificialAllele ) {
protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) {
this(bases, 0);
this.artificialAllele = artificialAllele;
this.artificialAllelePosition = artificialAllelePosition;
}
public Haplotype( final byte[] bases, final GenomeLoc loc ) {
@ -185,12 +187,17 @@ public class Haplotype {
return artificialAllele;
}
public void setArtificialAllele(final Allele artificialAllele) {
public int getArtificialAllelePosition() {
return artificialAllelePosition;
}
public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) {
this.artificialAllele = artificialAllele;
this.artificialAllelePosition = artificialAllelePosition;
}
@Requires({"refInsertLocation >= 0"})
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) {
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) {
// refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates
final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true);
if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype
@ -200,7 +207,7 @@ public class Haplotype {
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant
return new Haplotype(newHaplotypeBases, altAllele);
return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation);
}
public static class HaplotypeBaseComparator implements Comparator<Haplotype>, Serializable {

View File

@ -159,7 +159,7 @@ public class HaplotypeUnitTest extends BaseTest {
final VariantContext vc = new VariantContextBuilder().alleles(alleles).loc("1", loc, loc + h1refAllele.getBases().length - 1).make();
h.setAlignmentStartHapwrtRef(0);
h.setCigar(cigar);
final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc);
final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc, vc.getStart());
final Haplotype h1expected = new Haplotype(newHap.getBytes());
Assert.assertEquals(h1, h1expected);
}