Fixed non-determinism in HaplotypeCaller and some UnifiedGenotyper (UG) calls.

-- HaplotypeCaller and PerReadAlleleLikelihoodMap should use LinkedHashMaps instead of plain HashMaps. That way the ordering when traversing alleles is maintained. If the JVM traverses HashMaps with random ordering, different reads (with same likelihood) may be removed by contamination checker, and different alleles may be picked if they have same likelihoods for all reads.
-- Put in some GATKDocs and contracts in HaplotypeCaller files (far from done, code is a beast)
-- Updated MD5s due to the different iteration order of LinkedHashMaps versus HashMaps inside HaplotypeCaller (caused by the change in PerReadAlleleLikelihoodMap, which also slightly alters the reads chosen by per-read downsampling).
-- Re-enabled the testHaplotypeCallerMultiSampleGGAMultiAllelic test
-- Added some defensive argument checks to HaplotypeCaller public functions (this work is not yet complete).
This commit is contained in:
Guillermo del Angel 2013-02-11 14:59:46 -05:00
parent 38cea0a7ab
commit 4308b27f8c
3 changed files with 99 additions and 17 deletions

View File

@ -79,7 +79,25 @@ public class GenotypingEngine {
noCall.add(Allele.NO_CALL);
}
/**
* Main entry point of class - given a particular set of haplotypes, samples and reference context, compute
* genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling
*
* @param UG_engine UG Engine with basic input parameters
* @param haplotypes Haplotypes to assign likelihoods to
* @param samples Samples to genotype
* @param haplotypeReadMap Map from reads->(haplotypes,likelihoods)
* @param perSampleFilteredReadList
* @param ref Reference bytes at active region
* @param refLoc Corresponding active region genome location
* @param activeRegionWindow Active window
* @param genomeLocParser GenomeLocParser
* @param activeAllelesToGenotype Alleles to genotype
* @return List of VC's with genotyped events
*/
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
@Ensures("result != null")
// TODO - can this be refactored? this is hard to follow!
public List<VariantContext> assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine,
final List<Haplotype> haplotypes,
final List<String> samples,
@ -90,6 +108,25 @@ public class GenotypingEngine {
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final List<VariantContext> activeAllelesToGenotype ) {
// sanity check input arguments
if (UG_engine == null)
throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
if (haplotypes == null || haplotypes.isEmpty())
throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (samples == null || samples.isEmpty())
throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples);
if (haplotypeReadMap == null || haplotypeReadMap.isEmpty())
throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap);
if (ref == null || ref.length == 0 )
throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length)
throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
if (activeRegionWindow == null )
throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
if (activeAllelesToGenotype == null )
throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
if (genomeLocParser == null )
throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
@ -180,7 +217,7 @@ public class GenotypingEngine {
if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) {
throw new ReviewedStingException("Record size mismatch! Something went wrong in the merging of alleles.");
}
final Map<VariantContext, Allele> mergeMap = new HashMap<VariantContext, Allele>();
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<VariantContext, Allele>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) {
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
@ -214,6 +251,15 @@ public class GenotypingEngine {
return returnCalls;
}
/**
* For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele
* @param samples List of samples to genotype
* @param alleleReadMap Allele map describing mapping from reads to alleles and corresponding likelihoods
* @param mergedVC Input VC with event to genotype
* @return GenotypesContext object wrapping genotype objects with PLs
*/
@Requires({"samples != null","alleleReadMap!= null", "mergedVC != null"})
@Ensures("result != null")
private GenotypesContext calculateGLsForThisEvent( final List<String> samples, final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap, final VariantContext mergedVC ) {
final GenotypesContext genotypes = GenotypesContext.create(samples.size());
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
@ -254,7 +300,7 @@ public class GenotypingEngine {
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final VariantContext call ) {
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
final GenomeLoc callLoc = parser.createGenomeLoc(call);
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perSampleReadMap.entrySet() ) {
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
@ -283,6 +329,12 @@ public class GenotypingEngine {
return returnMap;
}
/**
* Removes symbolic events from list of haplotypes
* @param haplotypes Input/output list of haplotypes, before/after removal
*/
// TODO - split into input haplotypes and output haplotypes as not to share I/O arguments
@Requires("haplotypes != null")
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
final List<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
for( final Haplotype h : haplotypes ) {
@ -308,7 +360,7 @@ public class GenotypingEngine {
final double downsamplingFraction,
final PrintStream downsamplingLog ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final Map.Entry<Allele, List<Haplotype>> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele
@ -330,6 +382,15 @@ public class GenotypingEngine {
return alleleReadMap;
}
/**
* TODO - comment me, clean me, refactor me!
* @param haplotypes
* @param samples
* @param haplotypeReadMap
* @param startPosKeySet
* @param ref
* @param refLoc
*/
protected void mergeConsecutiveEventsBasedOnLD( final List<Haplotype> haplotypes,
final List<String> samples,
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
@ -474,7 +535,7 @@ public class GenotypingEngine {
}
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final Map<VariantContext, Allele> mergeMap, final Map<Event, List<Haplotype>> eventMap ) {
final Map<Allele, List<Haplotype>> alleleMapper = new HashMap<Allele, List<Haplotype>>();
final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<Allele, List<Haplotype>>();
for( final Map.Entry<VariantContext, Allele> entry : mergeMap.entrySet() ) {
alleleMapper.put(entry.getValue(), eventMap.get(new Event(entry.getKey())));
}
@ -485,7 +546,7 @@ public class GenotypingEngine {
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
protected static Map<Event, List<Haplotype>> createEventMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes ) {
final Map<Event, List<Haplotype>> eventMapper = new HashMap<Event, List<Haplotype>>(eventsAtThisLoc.size()+1);
final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<Event, List<Haplotype>>(eventsAtThisLoc.size()+1);
VariantContext refVC = eventsAtThisLoc.get(0); // the genome loc is the only safe thing to pull out of this VC because ref/alt pairs might change reference basis
eventMapper.put(new Event(null), new ArrayList<Haplotype>());
for( final VariantContext vc : eventsAtThisLoc ) {
@ -598,7 +659,7 @@ public class GenotypingEngine {
}
protected static Map<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) {
final Map<Integer,VariantContext> vcs = new HashMap<Integer,VariantContext>();
final Map<Integer,VariantContext> vcs = new LinkedHashMap<Integer,VariantContext>();
int refPos = alignmentStartHapwrtRef;
if( refPos < 0 ) { return null; } // Protection against SW failures

View File

@ -68,7 +68,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "042b76d4ba0c8f76e2e9cadd1c20d90d");
HCTest(CEUTRIO_BAM, "", "1e49fd927d79594a993ea6c4a1d10004");
}
@Test
@ -99,7 +99,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
"76d4c4a112cf60080adf74c3e116d1fb");
}
@Test(enabled = false) // TODO -- https://jira.broadinstitute.org/browse/GSA-722
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"23a4bfa0300683d8cf2ec16ce96e89ad");
@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("866406b43d22a262b2d852e7252eb430"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("598d245498c0d0b55e263f0a061a77e3"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}

View File

@ -64,15 +64,13 @@ public class PerReadAlleleLikelihoodMap {
if ( a == null ) throw new IllegalArgumentException("Cannot add a null allele to the allele likelihood map");
if ( likelihood == null ) throw new IllegalArgumentException("Likelihood cannot be null");
if ( likelihood > 0.0 ) throw new IllegalArgumentException("Likelihood must be negative (L = log(p))");
Map<Allele,Double> likelihoodMap;
if (likelihoodReadMap.containsKey(read)){
// seen pileup element before
likelihoodMap = likelihoodReadMap.get(read);
}
else {
likelihoodMap = new HashMap<Allele, Double>();
likelihoodReadMap.put(read,likelihoodMap);
Map<Allele,Double> likelihoodMap = likelihoodReadMap.get(read);
if (likelihoodMap == null){
// LinkedHashMap will ensure iterating through alleles will be in consistent order
likelihoodMap = new LinkedHashMap<Allele, Double>();
}
likelihoodReadMap.put(read,likelihoodMap);
likelihoodMap.put(a,likelihood);
if (!alleles.contains(a))
@ -221,4 +219,27 @@ public class PerReadAlleleLikelihoodMap {
}
return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL );
}
/**
 * Debug helper: dumps the contents of this likelihood map into a human-readable string.
 * The first line lists every allele tracked by the map; it is followed by one line per
 * (read, allele) pair showing the read name, the allele, and the associated
 * log-likelihood stored for that pair.
 *
 * @return a human-readable dump of the per-read allele likelihoods
 */
@Override
public String toString() {
    final StringBuilder sb = new StringBuilder();
    // header: the set of alleles currently tracked by this map
    sb.append("Alleles in map:");
    for (final Allele a : alleles) {
        sb.append(a.getDisplayString()).append(",");
    }
    sb.append("\n");
    // one line per (read, allele) likelihood entry
    for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readEntry : getLikelihoodReadMap().entrySet()) {
        for (final Map.Entry<Allele, Double> alleleEntry : readEntry.getValue().entrySet()) {
            sb.append("Read ").append(readEntry.getKey().getReadName())
              .append(". Allele:").append(alleleEntry.getKey().getDisplayString())
              .append(" has likelihood=").append(alleleEntry.getValue())
              .append("\n");
        }
    }
    return sb.toString();
}
}