We no longer subset down to the best N haplotypes for the GL calculation.

I explain in comments within the code that this was causing problems with the marginalization over events.
This commit is contained in:
Eric Banks 2013-06-04 09:26:50 -04:00
parent c0e3874db0
commit c0030f3f2d
2 changed files with 8 additions and 20 deletions

View File

@ -694,11 +694,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) );
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
final List<Haplotype> bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap);
// Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there
// was a bad interaction between that selection and the marginalization that happens over each event when computing
// GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the
// haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included
// in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
bestHaplotypes,
assemblyResult.haplotypes,
stratifiedReadMap,
perSampleFilteredReadList,
assemblyResult.fullReferenceWithPadding,
@ -711,7 +714,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
if ( bamWriter != null ) {
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
bestHaplotypes,
assemblyResult.haplotypes,
calledHaplotypes.getCalledHaplotypes(),
stratifiedReadMap);
}
@ -863,21 +866,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
}
/**
* Select the best N haplotypes according to their likelihoods, if appropriate
*
* @param haplotypes a list of haplotypes to consider
* @param stratifiedReadMap a map from samples -> read likelihoods
* @return the list of haplotypes to genotype
*/
protected List<Haplotype> selectBestHaplotypesForGenotyping(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
return haplotypes;
} else {
return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
}
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce

View File

@ -215,7 +215,7 @@ public abstract class LocalAssemblyEngine {
returnHaplotypes.add(h);
if ( debug )
logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize());
}
}
}