Reworking of how the likelihood calculation is organized in the HaplotypeCaller to facilitate the inclusion of per allele downsampling. We now use the downsampling for both the GL calculations and the annotation calculations.
This commit is contained in:
parent
97fd5de260
commit
1bdf17ef53
|
|
@ -30,12 +30,16 @@ import com.google.java.contract.Requires;
|
|||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GenotypingEngine {
|
||||
|
|
@ -43,23 +47,27 @@ public class GenotypingEngine {
|
|||
private final boolean DEBUG;
|
||||
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
|
||||
private final VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
public GenotypingEngine( final boolean DEBUG ) {
|
||||
public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine ) {
|
||||
this.DEBUG = DEBUG;
|
||||
this.annotationEngine = annotationEngine;
|
||||
noCall.add(Allele.NO_CALL);
|
||||
}
|
||||
|
||||
// BUGBUG: Create a class to hold this complicated return type
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, Map<Allele, List<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final List<Haplotype> haplotypes,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final List<VariantContext> activeAllelesToGenotype ) {
|
||||
public List<VariantContext> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final List<Haplotype> haplotypes,
|
||||
final List<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final Map<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final List<VariantContext> activeAllelesToGenotype ) {
|
||||
|
||||
final ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>>();
|
||||
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
|
||||
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
|
|
@ -79,8 +87,8 @@ public class GenotypingEngine {
|
|||
}
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
|
||||
if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc );
|
||||
}
|
||||
if( in_GGA_mode ) {
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
|
|
@ -90,7 +98,7 @@ public class GenotypingEngine {
|
|||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
|
||||
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
|
||||
|
||||
|
|
@ -167,12 +175,14 @@ public class GenotypingEngine {
|
|||
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
|
||||
}
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
|
||||
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final GenotypesContext genotypes = GenotypesContext.create(samples.size());
|
||||
for( final String sample : samples ) {
|
||||
final int numHaplotypes = alleleMapper.size();
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering);
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, alleleOrdering);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
|
|
@ -183,25 +193,55 @@ public class GenotypingEngine {
|
|||
}
|
||||
VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
if( call != null ) {
|
||||
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
|
||||
// also, need to update the allele -> haplotype mapping
|
||||
final HashMap<Allele, List<Haplotype>> alleleHashMapTrim = new HashMap<Allele, List<Haplotype>>();
|
||||
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
|
||||
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii)));
|
||||
}
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap, perSampleFilteredReadList, call );
|
||||
VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call);
|
||||
|
||||
call = vcCallTrim;
|
||||
alleleMapper = alleleHashMapTrim;
|
||||
if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
annotatedCall = VariantContextUtils.reverseTrimAlleles(annotatedCall);
|
||||
}
|
||||
|
||||
returnCalls.add( new Pair<VariantContext, Map<Allele,List<Haplotype>>>(call, alleleMapper) );
|
||||
returnCalls.add( annotatedCall );
|
||||
}
|
||||
}
|
||||
}
|
||||
return returnCalls;
|
||||
}
|
||||
|
||||
private static Map<String, PerReadAlleleLikelihoodMap> filterToOnlyOverlappingReads( final GenomeLocParser parser,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> perSampleReadMap,
|
||||
final Map<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final VariantContext call ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call);
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perSampleReadMap.entrySet() ) {
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
|
||||
for( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> mapEntry : likelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) {
|
||||
for( final Map.Entry<Allele,Double> a : mapEntry.getValue().entrySet() ) {
|
||||
likelihoodMap.add(mapEntry.getKey(), a.getKey(), a.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
|
||||
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
for( final Allele a : call.getAlleles() ) {
|
||||
likelihoodMap.add(read, a, 0.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
returnMap.put(sample.getKey(), likelihoodMap);
|
||||
}
|
||||
return returnMap;
|
||||
}
|
||||
|
||||
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
|
||||
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
|
|
@ -221,7 +261,41 @@ public class GenotypingEngine {
|
|||
haplotypes.removeAll(haplotypesToRemove);
|
||||
}
|
||||
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final List<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
|
||||
// BUGBUG: ugh, too complicated
|
||||
protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final Map<Allele, List<Haplotype>> alleleMapper,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
for( final Map.Entry<Allele, List<Haplotype>> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele
|
||||
final List<Haplotype> mappedHaplotypes = alleleMapperEntry.getValue();
|
||||
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read
|
||||
double maxLikelihood = Double.NEGATIVE_INFINITY;
|
||||
for( final Map.Entry<Allele,Double> alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele
|
||||
if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey().getBases())) ) { // exact match of haplotype base string
|
||||
maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() );
|
||||
}
|
||||
}
|
||||
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
|
||||
}
|
||||
}
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling
|
||||
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
return alleleReadMap;
|
||||
}
|
||||
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final List<Haplotype> haplotypes,
|
||||
final List<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final TreeSet<Integer> startPosKeySet,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc ) {
|
||||
|
||||
final int MAX_SIZE_TO_COMBINE = 15;
|
||||
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
|
||||
if( startPosKeySet.size() <= 1 ) { return; }
|
||||
|
|
@ -265,12 +339,13 @@ public class GenotypingEngine {
|
|||
}
|
||||
}
|
||||
// count up the co-occurrences of the events for the R^2 calculation
|
||||
final ArrayList<Haplotype> haplotypeList = new ArrayList<Haplotype>();
|
||||
haplotypeList.add(h);
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) {
|
||||
for( final String sample : samples ) {
|
||||
final HashSet<String> sampleSet = new HashSet<String>(1);
|
||||
sampleSet.add(sample);
|
||||
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0];
|
||||
|
||||
final List<Allele> alleleList = new ArrayList<Allele>();
|
||||
alleleList.add(Allele.create(h.getBases()));
|
||||
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeReadMap, alleleList )[0][0];
|
||||
if( thisHapVC == null ) {
|
||||
if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); }
|
||||
else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); }
|
||||
|
|
|
|||
|
|
@ -202,9 +202,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
// the genotyping engine
|
||||
private GenotypingEngine genotypingEngine = null;
|
||||
|
||||
// the annotation engine
|
||||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
private CachingIndexedFastaSequenceFile referenceReader;
|
||||
|
||||
|
|
@ -249,7 +246,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// initialize the output VCF header
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
|
||||
|
|
@ -282,7 +279,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
|
||||
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine );
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -398,21 +395,23 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList );
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) );
|
||||
final Map<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
||||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
||||
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes );
|
||||
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap ) : haplotypes );
|
||||
|
||||
for( final Pair<VariantContext, Map<Allele, List<Haplotype>>> callResult :
|
||||
genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) {
|
||||
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
|
||||
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
|
||||
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
|
||||
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
|
||||
for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine,
|
||||
bestHaplotypes,
|
||||
samplesList,
|
||||
stratifiedReadMap,
|
||||
perSampleFilteredReadList,
|
||||
fullReferenceWithPadding,
|
||||
getPaddedLoc(activeRegion),
|
||||
activeRegion.getLocation(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
activeAllelesToGenotype ) ) {
|
||||
vcfWriter.add( call );
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); }
|
||||
|
|
@ -467,7 +466,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
// protect against INTERVALS with abnormally high coverage
|
||||
// BUGBUG: remove when positinal downsampler is hooked up to ART/HC
|
||||
// BUGBUG: remove when positional downsampler is hooked up to ART/HC
|
||||
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
|
||||
activeRegion.add(clippedRead);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -71,8 +71,9 @@ public class LikelihoodCalculationEngine {
|
|||
DEBUG = debug;
|
||||
}
|
||||
|
||||
public void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
int X_METRIC_LENGTH = 0;
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
for( final GATKSAMRecord read : sample.getValue() ) {
|
||||
|
|
@ -97,20 +98,16 @@ public class LikelihoodCalculationEngine {
|
|||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() );
|
||||
stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue(), sampleEntry.getKey()));
|
||||
}
|
||||
return stratifiedReadMap;
|
||||
}
|
||||
|
||||
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
|
||||
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
|
||||
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final int numReads = reads.size();
|
||||
final double[][] readLikelihoods = new double[numHaplotypes][numReads];
|
||||
final int[][] readCounts = new int[numHaplotypes][numReads];
|
||||
for( int iii = 0; iii < numReads; iii++ ) {
|
||||
final GATKSAMRecord read = reads.get(iii);
|
||||
final int readCount = ReadUtils.getMeanRepresentativeReadCount(read);
|
||||
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
final byte[] overallGCP = new byte[read.getReadLength()];
|
||||
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
Haplotype previousHaplotypeSeen = null;
|
||||
|
|
@ -129,14 +126,12 @@ public class LikelihoodCalculationEngine {
|
|||
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
|
||||
previousHaplotypeSeen = haplotype;
|
||||
|
||||
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0);
|
||||
readCounts[jjj][iii] = readCount;
|
||||
perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()),
|
||||
pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0));
|
||||
}
|
||||
}
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] );
|
||||
}
|
||||
return perReadAlleleLikelihoodMap;
|
||||
}
|
||||
|
||||
private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
|
||||
|
|
@ -148,19 +143,21 @@ public class LikelihoodCalculationEngine {
|
|||
return Math.min(b1.length, b2.length);
|
||||
}
|
||||
|
||||
// This function takes just a single sample and a haplotypeMapping
|
||||
@Requires({"haplotypeMapping.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map<Allele, List<Haplotype>> haplotypeMapping, final List<Allele> alleleOrdering ) {
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering ) {
|
||||
final TreeSet<String> sampleSet = new TreeSet<String>();
|
||||
sampleSet.add(sample);
|
||||
return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering);
|
||||
return computeDiploidHaplotypeLikelihoods(sampleSet, stratifiedReadMap, alleleOrdering);
|
||||
}
|
||||
|
||||
// This function takes a set of samples to pool over and a haplotypeMapping
|
||||
@Requires({"haplotypeMapping.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final Map<Allele, List<Haplotype>> haplotypeMapping, final List<Allele> alleleOrdering ) {
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering ) {
|
||||
|
||||
final int numHaplotypes = alleleOrdering.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
|
|
@ -170,59 +167,19 @@ public class LikelihoodCalculationEngine {
|
|||
|
||||
// compute the diploid haplotype likelihoods
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
final Allele iii_allele = alleleOrdering.get(iii);
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) {
|
||||
for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) {
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
|
||||
final int[] readCounts_iii = iii_mapped.getReadCounts(sample);
|
||||
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
|
||||
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
}
|
||||
|
||||
// This function takes a set of samples to pool over and a haplotypeMapping
|
||||
@Requires({"haplotypeList.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final List<Haplotype> haplotypeList ) {
|
||||
|
||||
final int numHaplotypes = haplotypeList.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
// todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
final Haplotype iii_haplotype = haplotypeList.get(iii);
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
final Haplotype jjj_haplotype = haplotypeList.get(jjj);
|
||||
final Allele jjj_allele = alleleOrdering.get(jjj);
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample);
|
||||
final int[] readCounts_iii = iii_haplotype.getReadCounts(sample);
|
||||
final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample);
|
||||
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
|
||||
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
|
||||
haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) *
|
||||
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -312,13 +269,16 @@ public class LikelihoodCalculationEngine {
|
|||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.size() <= haplotypes.size()"})
|
||||
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes ) {
|
||||
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap ) {
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
|
||||
final Set<String> sampleKeySet = stratifiedReadMap.keySet();
|
||||
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together
|
||||
final List<Allele> haplotypesAsAlleles = new ArrayList<Allele>();
|
||||
for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h.getBases())); }
|
||||
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles ); // all samples pooled together
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
|
|
@ -358,52 +318,4 @@ public class LikelihoodCalculationEngine {
|
|||
}
|
||||
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
|
||||
}
|
||||
|
||||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final Pair<VariantContext, Map<Allele,List<Haplotype>>> call,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog ) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
|
||||
|
||||
final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue();
|
||||
for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
|
||||
final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
for( final Allele a : call.getFirst().getAlleles() ) {
|
||||
double maxLikelihood = Double.NEGATIVE_INFINITY;
|
||||
for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
|
||||
final double likelihood = h.getReadLikelihoods(sample.getKey())[iii];
|
||||
if( likelihood > maxLikelihood ) {
|
||||
maxLikelihood = likelihood;
|
||||
}
|
||||
}
|
||||
likelihoodMap.add(read, a, maxLikelihood);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// down-sample before adding filtered reads
|
||||
likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
|
||||
|
||||
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
|
||||
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
for( final Allele a : call.getFirst().getAlleles() ) {
|
||||
likelihoodMap.add(read, a, 0.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
returnMap.put(sample.getKey(), likelihoodMap);
|
||||
|
||||
}
|
||||
return returnMap;
|
||||
}
|
||||
}
|
||||
|
|
@ -329,7 +329,6 @@ public class PairHMMIndelErrorModel {
|
|||
getContextHomopolymerLength(readBases,hrunProfile);
|
||||
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
|
||||
|
||||
|
||||
for (Allele a: haplotypeMap.keySet()) {
|
||||
|
||||
Haplotype haplotype = haplotypeMap.get(a);
|
||||
|
|
|
|||
|
|
@ -41,8 +41,6 @@ public class Haplotype {
|
|||
protected final byte[] bases;
|
||||
protected final double[] quals;
|
||||
private GenomeLoc genomeLocation = null;
|
||||
private HashMap<String, double[]> readLikelihoodsPerSample = null;
|
||||
private HashMap<String, int[]> readCountsPerSample = null;
|
||||
private HashMap<Integer, VariantContext> eventMap = null;
|
||||
private boolean isRef = false;
|
||||
private Cigar cigar;
|
||||
|
|
@ -94,31 +92,6 @@ public class Haplotype {
|
|||
return Arrays.hashCode(bases);
|
||||
}
|
||||
|
||||
public void addReadLikelihoods( final String sample, final double[] readLikelihoods, final int[] readCounts ) {
|
||||
if( readLikelihoodsPerSample == null ) {
|
||||
readLikelihoodsPerSample = new HashMap<String, double[]>();
|
||||
}
|
||||
readLikelihoodsPerSample.put(sample, readLikelihoods);
|
||||
if( readCountsPerSample == null ) {
|
||||
readCountsPerSample = new HashMap<String, int[]>();
|
||||
}
|
||||
readCountsPerSample.put(sample, readCounts);
|
||||
}
|
||||
|
||||
@Ensures({"result != null"})
|
||||
public double[] getReadLikelihoods( final String sample ) {
|
||||
return readLikelihoodsPerSample.get(sample);
|
||||
}
|
||||
|
||||
@Ensures({"result != null"})
|
||||
public int[] getReadCounts( final String sample ) {
|
||||
return readCountsPerSample.get(sample);
|
||||
}
|
||||
|
||||
public Set<String> getSampleKeySet() {
|
||||
return readLikelihoodsPerSample.keySet();
|
||||
}
|
||||
|
||||
public HashMap<Integer, VariantContext> getEventMap() {
|
||||
return eventMap;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,10 +38,10 @@ import java.util.*;
|
|||
|
||||
public abstract class PerReadAlleleLikelihoodMap {
|
||||
|
||||
public static final double INDEL_LIKELIHOOD_THRESH = 0.1;
|
||||
public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.1;
|
||||
|
||||
protected List<Allele> alleles;
|
||||
protected Map<GATKSAMRecord,Map<Allele,Double>> likelihoodReadMap;
|
||||
protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
|
||||
|
||||
public abstract void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log);
|
||||
public abstract ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log);
|
||||
|
|
@ -68,7 +68,7 @@ public abstract class PerReadAlleleLikelihoodMap {
|
|||
}
|
||||
|
||||
public void add(PileupElement p, Allele a, Double likelihood) {
|
||||
add(p.getRead(),a,likelihood);
|
||||
add(p.getRead(), a, likelihood);
|
||||
}
|
||||
|
||||
public boolean containsPileupElement(PileupElement p) {
|
||||
|
|
@ -120,7 +120,7 @@ public abstract class PerReadAlleleLikelihoodMap {
|
|||
prevMaxLike = el.getValue();
|
||||
}
|
||||
}
|
||||
return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL );
|
||||
return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL );
|
||||
}
|
||||
|
||||
public static PerReadAlleleLikelihoodMap getBestAvailablePerReadAlleleLikelihoodMap() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue