Reworking of how the likelihood calculation is organized in the HaplotypeCaller to facilitate the inclusion of per allele downsampling. We now use the downsampling for both the GL calculations and the annotation calculations.

This commit is contained in:
Ryan Poplin 2012-12-02 11:58:32 -05:00
parent 97fd5de260
commit 1bdf17ef53
6 changed files with 157 additions and 199 deletions

View File

@ -30,12 +30,16 @@ import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class GenotypingEngine {
@ -43,23 +47,27 @@ public class GenotypingEngine {
private final boolean DEBUG;
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
private final VariantAnnotatorEngine annotationEngine;
public GenotypingEngine( final boolean DEBUG ) {
public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine ) {
this.DEBUG = DEBUG;
this.annotationEngine = annotationEngine;
noCall.add(Allele.NO_CALL);
}
// BUGBUG: Create a class to hold this complicated return type
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
public List<Pair<VariantContext, Map<Allele, List<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
final List<Haplotype> haplotypes,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final List<VariantContext> activeAllelesToGenotype ) {
public List<VariantContext> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
final List<Haplotype> haplotypes,
final List<String> samples,
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
final Map<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final List<VariantContext> activeAllelesToGenotype ) {
final ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>>();
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
@ -79,8 +87,8 @@ public class GenotypingEngine {
}
cleanUpSymbolicUnassembledEvents( haplotypes );
if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc );
}
if( in_GGA_mode ) {
for( final VariantContext compVC : activeAllelesToGenotype ) {
@ -90,7 +98,7 @@ public class GenotypingEngine {
// Walk along each position in the key set and create each event to be outputted
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
@ -167,12 +175,14 @@ public class GenotypingEngine {
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
}
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
final GenotypesContext genotypes = GenotypesContext.create(samples.size());
for( final String sample : samples ) {
final int numHaplotypes = alleleMapper.size();
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering);
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, alleleOrdering);
int glIndex = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
@ -183,25 +193,55 @@ public class GenotypingEngine {
}
VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
if( call != null ) {
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
// also, need to update the allele -> haplotype mapping
final HashMap<Allele, List<Haplotype>> alleleHashMapTrim = new HashMap<Allele, List<Haplotype>>();
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii)));
}
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap, perSampleFilteredReadList, call );
VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call);
call = vcCallTrim;
alleleMapper = alleleHashMapTrim;
if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
annotatedCall = VariantContextUtils.reverseTrimAlleles(annotatedCall);
}
returnCalls.add( new Pair<VariantContext, Map<Allele,List<Haplotype>>>(call, alleleMapper) );
returnCalls.add( annotatedCall );
}
}
}
return returnCalls;
}
private static Map<String, PerReadAlleleLikelihoodMap> filterToOnlyOverlappingReads( final GenomeLocParser parser,
final Map<String, PerReadAlleleLikelihoodMap> perSampleReadMap,
final Map<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
final VariantContext call ) {
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
final GenomeLoc callLoc = parser.createGenomeLoc(call);
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perSampleReadMap.entrySet() ) {
final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
for( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> mapEntry : likelihoodMap.getLikelihoodReadMap().entrySet() ) {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) {
for( final Map.Entry<Allele,Double> a : mapEntry.getValue().entrySet() ) {
likelihoodMap.add(mapEntry.getKey(), a.getKey(), a.getValue());
}
}
}
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
for( final Allele a : call.getAlleles() ) {
likelihoodMap.add(read, a, 0.0);
}
}
}
returnMap.put(sample.getKey(), likelihoodMap);
}
return returnMap;
}
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
for( final Haplotype h : haplotypes ) {
@ -221,7 +261,41 @@ public class GenotypingEngine {
haplotypes.removeAll(haplotypesToRemove);
}
protected void mergeConsecutiveEventsBasedOnLD( final List<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
// BUGBUG: ugh, too complicated
protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
final Map<Allele, List<Haplotype>> alleleMapper,
final double downsamplingFraction,
final PrintStream downsamplingLog ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
for( final Map.Entry<Allele, List<Haplotype>> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele
final List<Haplotype> mappedHaplotypes = alleleMapperEntry.getValue();
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read
double maxLikelihood = Double.NEGATIVE_INFINITY;
for( final Map.Entry<Allele,Double> alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele
if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey().getBases())) ) { // exact match of haplotype base string
maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() );
}
}
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
}
}
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
}
return alleleReadMap;
}
protected void mergeConsecutiveEventsBasedOnLD( final List<Haplotype> haplotypes,
final List<String> samples,
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
final TreeSet<Integer> startPosKeySet,
final byte[] ref,
final GenomeLoc refLoc ) {
final int MAX_SIZE_TO_COMBINE = 15;
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
if( startPosKeySet.size() <= 1 ) { return; }
@ -265,12 +339,13 @@ public class GenotypingEngine {
}
}
// count up the co-occurrences of the events for the R^2 calculation
final ArrayList<Haplotype> haplotypeList = new ArrayList<Haplotype>();
haplotypeList.add(h);
for( final String sample : haplotypes.get(0).getSampleKeySet() ) {
for( final String sample : samples ) {
final HashSet<String> sampleSet = new HashSet<String>(1);
sampleSet.add(sample);
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0];
final List<Allele> alleleList = new ArrayList<Allele>();
alleleList.add(Allele.create(h.getBases()));
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeReadMap, alleleList )[0][0];
if( thisHapVC == null ) {
if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); }
else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); }

View File

@ -202,9 +202,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
// the genotyping engine
private GenotypingEngine genotypingEngine = null;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
// fasta reference reader to supplement the edges of the reference sequence
private CachingIndexedFastaSequenceFile referenceReader;
@ -249,7 +246,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
// initialize the output VCF header
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
@ -282,7 +279,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
genotypingEngine = new GenotypingEngine( DEBUG );
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine );
}
//---------------------------------------------------------------------------------------------------------------
@ -398,21 +395,23 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
// evaluate each sample's reads against all haplotypes
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList );
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) );
final Map<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes );
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap ) : haplotypes );
for( final Pair<VariantContext, Map<Allele, List<Haplotype>>> callResult :
genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) {
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine,
bestHaplotypes,
samplesList,
stratifiedReadMap,
perSampleFilteredReadList,
fullReferenceWithPadding,
getPaddedLoc(activeRegion),
activeRegion.getLocation(),
getToolkit().getGenomeLocParser(),
activeAllelesToGenotype ) ) {
vcfWriter.add( call );
}
if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); }
@ -467,7 +466,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
// protect against INTERVALS with abnormally high coverage
// BUGBUG: remove when positinal downsampler is hooked up to ART/HC
// BUGBUG: remove when positional downsampler is hooked up to ART/HC
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
activeRegion.add(clippedRead);
}

View File

@ -71,8 +71,9 @@ public class LikelihoodCalculationEngine {
DEBUG = debug;
}
public void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
int X_METRIC_LENGTH = 0;
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
for( final GATKSAMRecord read : sample.getValue() ) {
@ -97,20 +98,16 @@ public class LikelihoodCalculationEngine {
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
// evaluate the likelihood of the reads given those haplotypes
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() );
stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue(), sampleEntry.getKey()));
}
return stratifiedReadMap;
}
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
final int numHaplotypes = haplotypes.size();
final int numReads = reads.size();
final double[][] readLikelihoods = new double[numHaplotypes][numReads];
final int[][] readCounts = new int[numHaplotypes][numReads];
for( int iii = 0; iii < numReads; iii++ ) {
final GATKSAMRecord read = reads.get(iii);
final int readCount = ReadUtils.getMeanRepresentativeReadCount(read);
for( final GATKSAMRecord read : reads ) {
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
Haplotype previousHaplotypeSeen = null;
@ -129,14 +126,12 @@ public class LikelihoodCalculationEngine {
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
previousHaplotypeSeen = haplotype;
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0);
readCounts[jjj][iii] = readCount;
perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()),
pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0));
}
}
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] );
}
return perReadAlleleLikelihoodMap;
}
private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
@ -148,19 +143,21 @@ public class LikelihoodCalculationEngine {
return Math.min(b1.length, b2.length);
}
// This function takes just a single sample and a haplotypeMapping
@Requires({"haplotypeMapping.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map<Allele, List<Haplotype>> haplotypeMapping, final List<Allele> alleleOrdering ) {
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
final List<Allele> alleleOrdering ) {
final TreeSet<String> sampleSet = new TreeSet<String>();
sampleSet.add(sample);
return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering);
return computeDiploidHaplotypeLikelihoods(sampleSet, stratifiedReadMap, alleleOrdering);
}
// This function takes a set of samples to pool over and a haplotypeMapping
@Requires({"haplotypeMapping.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final Map<Allele, List<Haplotype>> haplotypeMapping, final List<Allele> alleleOrdering ) {
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
final List<Allele> alleleOrdering ) {
final int numHaplotypes = alleleOrdering.size();
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
@ -170,59 +167,19 @@ public class LikelihoodCalculationEngine {
// compute the diploid haplotype likelihoods
for( int iii = 0; iii < numHaplotypes; iii++ ) {
final Allele iii_allele = alleleOrdering.get(iii);
for( int jjj = 0; jjj <= iii; jjj++ ) {
for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) {
for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) {
double haplotypeLikelihood = 0.0;
for( final String sample : samples ) {
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
final int[] readCounts_iii = iii_mapped.getReadCounts(sample);
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
}
}
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
}
}
}
}
// normalize the diploid likelihoods matrix
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
}
// This function takes a set of samples to pool over and a haplotypeMapping
@Requires({"haplotypeList.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final List<Haplotype> haplotypeList ) {
final int numHaplotypes = haplotypeList.size();
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
for( int iii = 0; iii < numHaplotypes; iii++ ) {
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
}
// compute the diploid haplotype likelihoods
// todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
for( int iii = 0; iii < numHaplotypes; iii++ ) {
final Haplotype iii_haplotype = haplotypeList.get(iii);
for( int jjj = 0; jjj <= iii; jjj++ ) {
final Haplotype jjj_haplotype = haplotypeList.get(jjj);
final Allele jjj_allele = alleleOrdering.get(jjj);
double haplotypeLikelihood = 0.0;
for( final String sample : samples ) {
final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample);
final int[] readCounts_iii = iii_haplotype.getReadCounts(sample);
final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample);
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) *
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + LOG_ONE_HALF );
}
}
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
}
}
@ -312,13 +269,16 @@ public class LikelihoodCalculationEngine {
@Requires({"haplotypes.size() > 0"})
@Ensures({"result.size() <= haplotypes.size()"})
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes ) {
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap ) {
final int numHaplotypes = haplotypes.size();
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
final Set<String> sampleKeySet = stratifiedReadMap.keySet();
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together
final List<Allele> haplotypesAsAlleles = new ArrayList<Allele>();
for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h.getBases())); }
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles ); // all samples pooled together
int hap1 = 0;
int hap2 = 0;
@ -358,52 +318,4 @@ public class LikelihoodCalculationEngine {
}
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
final Pair<VariantContext, Map<Allele,List<Haplotype>>> call,
final double downsamplingFraction,
final PrintStream downsamplingLog ) {
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue();
for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
for( final Allele a : call.getFirst().getAlleles() ) {
double maxLikelihood = Double.NEGATIVE_INFINITY;
for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
final double likelihood = h.getReadLikelihoods(sample.getKey())[iii];
if( likelihood > maxLikelihood ) {
maxLikelihood = likelihood;
}
}
likelihoodMap.add(read, a, maxLikelihood);
}
}
}
// down-sample before adding filtered reads
likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
for( final Allele a : call.getFirst().getAlleles() ) {
likelihoodMap.add(read, a, 0.0);
}
}
}
returnMap.put(sample.getKey(), likelihoodMap);
}
return returnMap;
}
}

View File

@ -329,7 +329,6 @@ public class PairHMMIndelErrorModel {
getContextHomopolymerLength(readBases,hrunProfile);
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
for (Allele a: haplotypeMap.keySet()) {
Haplotype haplotype = haplotypeMap.get(a);

View File

@ -41,8 +41,6 @@ public class Haplotype {
protected final byte[] bases;
protected final double[] quals;
private GenomeLoc genomeLocation = null;
private HashMap<String, double[]> readLikelihoodsPerSample = null;
private HashMap<String, int[]> readCountsPerSample = null;
private HashMap<Integer, VariantContext> eventMap = null;
private boolean isRef = false;
private Cigar cigar;
@ -94,31 +92,6 @@ public class Haplotype {
return Arrays.hashCode(bases);
}
public void addReadLikelihoods( final String sample, final double[] readLikelihoods, final int[] readCounts ) {
if( readLikelihoodsPerSample == null ) {
readLikelihoodsPerSample = new HashMap<String, double[]>();
}
readLikelihoodsPerSample.put(sample, readLikelihoods);
if( readCountsPerSample == null ) {
readCountsPerSample = new HashMap<String, int[]>();
}
readCountsPerSample.put(sample, readCounts);
}
@Ensures({"result != null"})
public double[] getReadLikelihoods( final String sample ) {
return readLikelihoodsPerSample.get(sample);
}
@Ensures({"result != null"})
public int[] getReadCounts( final String sample ) {
return readCountsPerSample.get(sample);
}
public Set<String> getSampleKeySet() {
return readLikelihoodsPerSample.keySet();
}
public HashMap<Integer, VariantContext> getEventMap() {
return eventMap;
}

View File

@ -38,10 +38,10 @@ import java.util.*;
public abstract class PerReadAlleleLikelihoodMap {
public static final double INDEL_LIKELIHOOD_THRESH = 0.1;
public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.1;
protected List<Allele> alleles;
protected Map<GATKSAMRecord,Map<Allele,Double>> likelihoodReadMap;
protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
public abstract void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log);
public abstract ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log);
@ -68,7 +68,7 @@ public abstract class PerReadAlleleLikelihoodMap {
}
public void add(PileupElement p, Allele a, Double likelihood) {
add(p.getRead(),a,likelihood);
add(p.getRead(), a, likelihood);
}
public boolean containsPileupElement(PileupElement p) {
@ -120,7 +120,7 @@ public abstract class PerReadAlleleLikelihoodMap {
prevMaxLike = el.getValue();
}
}
return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL );
return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL );
}
public static PerReadAlleleLikelihoodMap getBestAvailablePerReadAlleleLikelihoodMap() {