From 1bdf17ef53710e1b1c8633e6dcc4b2c549d48a82 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sun, 2 Dec 2012 11:58:32 -0500 Subject: [PATCH] Reworking of how the likelihood calculation is organized in the HaplotypeCaller to facilitate the inclusion of per allele downsampling. We now use the downsampling for both the GL calculations and the annotation calculations. --- .../haplotypecaller/GenotypingEngine.java | 137 ++++++++++++---- .../haplotypecaller/HaplotypeCaller.java | 35 ++--- .../LikelihoodCalculationEngine.java | 148 ++++-------------- .../indels/PairHMMIndelErrorModel.java | 1 - .../broadinstitute/sting/utils/Haplotype.java | 27 ---- .../genotyper/PerReadAlleleLikelihoodMap.java | 8 +- 6 files changed, 157 insertions(+), 199 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 4fc2dc8f7..6f94e2657 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -30,12 +30,16 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.*; +import java.io.PrintStream; import java.util.*; public class GenotypingEngine { @@ -43,23 +47,27 @@ public class GenotypingEngine { private final boolean DEBUG; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); + private final VariantAnnotatorEngine annotationEngine; - public GenotypingEngine( final boolean DEBUG ) { + public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine ) { this.DEBUG = DEBUG; + this.annotationEngine = annotationEngine; noCall.add(Allele.NO_CALL); } - // BUGBUG: Create a class to hold this complicated return type @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, - final List haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final List activeAllelesToGenotype ) { + public List assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final List samples, + final Map haplotypeReadMap, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { - final ArrayList>>> returnCalls = new ArrayList>>>(); + final List returnCalls = new ArrayList(); final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file @@ -79,8 +87,8 @@ public class GenotypingEngine { } cleanUpSymbolicUnassembledEvents( haplotypes ); - if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure - mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); + if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure + mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc ); } if( in_GGA_mode ) { for( final VariantContext compVC : activeAllelesToGenotype ) { @@ -90,7 +98,7 @@ public class GenotypingEngine { // Walk along each position in the key set and create each event to be outputted for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { + if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view @@ -167,12 +175,14 @@ public class GenotypingEngine { //System.out.println("Event/haplotype allele mapping = " + alleleMapper); } + final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); + // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples + final GenotypesContext genotypes = GenotypesContext.create(samples.size()); + for( final String sample : samples ) { final int numHaplotypes = alleleMapper.size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering); + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, alleleOrdering); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { @@ -183,25 +193,55 @@ public class GenotypingEngine { } VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); if( call != null ) { - if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! - final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call); - // also, need to update the allele -> haplotype mapping - final HashMap> alleleHashMapTrim = new HashMap>(); - for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC - alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii))); - } + final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap, perSampleFilteredReadList, call ); + VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call); - call = vcCallTrim; - alleleMapper = alleleHashMapTrim; + if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! + annotatedCall = VariantContextUtils.reverseTrimAlleles(annotatedCall); } - returnCalls.add( new Pair>>(call, alleleMapper) ); + returnCalls.add( annotatedCall ); } } } return returnCalls; } + private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, + final Map perSampleReadMap, + final Map> perSampleFilteredReadList, + final VariantContext call ) { + + final Map returnMap = new HashMap(); + final GenomeLoc callLoc = parser.createGenomeLoc(call); + for( final Map.Entry sample : perSampleReadMap.entrySet() ) { + final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); + + for( final Map.Entry> mapEntry : likelihoodMap.getLikelihoodReadMap().entrySet() ) { + // only count the read if it overlaps the event, otherwise it is not added to the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) { + for( final Map.Entry a : mapEntry.getValue().entrySet() ) { + likelihoodMap.add(mapEntry.getKey(), a.getKey(), a.getValue()); + } + } + } + + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods + for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { + // only count the read if it overlaps the event, otherwise it is not added to the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { + for( final Allele a : call.getAlleles() ) { + likelihoodMap.add(read, a, 0.0); + } + } + } + + returnMap.put(sample.getKey(), likelihoodMap); + } + return returnMap; + } + + protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final ArrayList haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { @@ -221,7 +261,41 @@ public class GenotypingEngine { haplotypes.removeAll(haplotypesToRemove); } - protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { + // BUGBUG: ugh, too complicated + protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, + final Map> alleleMapper, + final double downsamplingFraction, + final PrintStream downsamplingLog ) { + + final Map alleleReadMap = new HashMap(); + for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); + for( final Map.Entry> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele + final List mappedHaplotypes = alleleMapperEntry.getValue(); + for( final Map.Entry> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read + double maxLikelihood = Double.NEGATIVE_INFINITY; + for( final Map.Entry alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele + if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey().getBases())) ) { // exact match of haplotype base string + maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() ); + } + } + perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); + } + } + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling + alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); + } + + return alleleReadMap; + } + + protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, + final List samples, + final Map haplotypeReadMap, + final TreeSet startPosKeySet, + final byte[] ref, + final GenomeLoc refLoc ) { + final int MAX_SIZE_TO_COMBINE = 15; final double MERGE_EVENTS_R2_THRESHOLD = 0.95; if( startPosKeySet.size() <= 1 ) { return; } @@ -265,12 +339,13 @@ public class GenotypingEngine { } } // count up the co-occurrences of the events for the R^2 calculation - final ArrayList haplotypeList = new ArrayList(); - haplotypeList.add(h); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { + for( final String sample : samples ) { final HashSet sampleSet = new HashSet(1); sampleSet.add(sample); - final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0]; + + final List alleleList = new ArrayList(); + alleleList.add(Allele.create(h.getBases())); + final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeReadMap, alleleList )[0][0]; if( thisHapVC == null ) { if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 24b3309f1..2b3513bef 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -202,9 +202,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // the genotyping engine private GenotypingEngine genotypingEngine = null; - // the annotation engine - private VariantAnnotatorEngine annotationEngine; - // fasta reference reader to supplement the edges of the reference sequence private CachingIndexedFastaSequenceFile referenceReader; @@ -249,7 +246,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header - annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); Set headerInfo = new HashSet(); @@ -282,7 +279,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG ); + genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine ); } //--------------------------------------------------------------------------------------------------------------- @@ -398,21 +395,23 @@ public class HaplotypeCaller extends ActiveRegionWalker implem Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); // evaluate each sample's reads against all haplotypes - final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() ); - final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads ); - likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList ); + final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) ); + final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes ); + final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap ) : haplotypes ); - for( final Pair>> callResult : - genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { - if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } - - final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); - final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); - final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); + for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, + bestHaplotypes, + samplesList, + stratifiedReadMap, + perSampleFilteredReadList, + fullReferenceWithPadding, + getPaddedLoc(activeRegion), + activeRegion.getLocation(), + getToolkit().getGenomeLocParser(), + activeAllelesToGenotype ) ) { + vcfWriter.add( call ); } if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } @@ -467,7 +466,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage - // BUGBUG: remove when positinal downsampler is hooked up to ART/HC + // BUGBUG: remove when positional downsampler is hooked up to ART/HC if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { activeRegion.add(clippedRead); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 29622ca17..4a5c7fe9b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -71,8 +71,9 @@ public class LikelihoodCalculationEngine { DEBUG = debug; } - public void computeReadLikelihoods( final ArrayList haplotypes, final HashMap> perSampleReadList ) { + public Map computeReadLikelihoods( final ArrayList haplotypes, final HashMap> perSampleReadList ) { + final Map stratifiedReadMap = new HashMap(); int X_METRIC_LENGTH = 0; for( final Map.Entry> sample : perSampleReadList.entrySet() ) { for( final GATKSAMRecord read : sample.getValue() ) { @@ -97,20 +98,16 @@ public class LikelihoodCalculationEngine { for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() ); + stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue(), sampleEntry.getKey())); } + return stratifiedReadMap; } - private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) { + private PerReadAlleleLikelihoodMap computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); final int numHaplotypes = haplotypes.size(); - final int numReads = reads.size(); - final double[][] readLikelihoods = new double[numHaplotypes][numReads]; - final int[][] readCounts = new int[numHaplotypes][numReads]; - for( int iii = 0; iii < numReads; iii++ ) { - final GATKSAMRecord read = reads.get(iii); - final int readCount = ReadUtils.getMeanRepresentativeReadCount(read); - + for( final GATKSAMRecord read : reads ) { final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? Haplotype previousHaplotypeSeen = null; @@ -129,14 +126,12 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), - readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0); - readCounts[jjj][iii] = readCount; + perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()), + pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), + readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); } } - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] ); - } + return perReadAlleleLikelihoodMap; } private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) { @@ -148,19 +143,21 @@ public class LikelihoodCalculationEngine { return Math.min(b1.length, b2.length); } - // This function takes just a single sample and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map> haplotypeMapping, final List alleleOrdering ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, + final Map stratifiedReadMap, + final List alleleOrdering ) { final TreeSet sampleSet = new TreeSet(); sampleSet.add(sample); - return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering); + return computeDiploidHaplotypeLikelihoods(sampleSet, stratifiedReadMap, alleleOrdering); } - // This function takes a set of samples to pool over and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final Map> haplotypeMapping, final List alleleOrdering ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, + final Map stratifiedReadMap, + final List alleleOrdering ) { final int numHaplotypes = alleleOrdering.size(); final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; @@ -170,59 +167,19 @@ public class LikelihoodCalculationEngine { // compute the diploid haplotype likelihoods for( int iii = 0; iii < numHaplotypes; iii++ ) { + final Allele iii_allele = alleleOrdering.get(iii); for( int jjj = 0; jjj <= iii; jjj++ ) { - for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { - for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) { - double haplotypeLikelihood = 0.0; - for( final String sample : samples ) { - final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); - final int[] readCounts_iii = iii_mapped.getReadCounts(sample); - final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample); - for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { - // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - // First term is approximated by Jacobian log with table lookup. - haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); - } - } - haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); - } - } - } - } - - // normalize the diploid likelihoods matrix - return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); - } - - // This function takes a set of samples to pool over and a haplotypeMapping - @Requires({"haplotypeList.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final List haplotypeList ) { - - final int numHaplotypes = haplotypeList.size(); - final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); - } - - // compute the diploid haplotype likelihoods - // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code - for( int iii = 0; iii < numHaplotypes; iii++ ) { - final Haplotype iii_haplotype = haplotypeList.get(iii); - for( int jjj = 0; jjj <= iii; jjj++ ) { - final Haplotype jjj_haplotype = haplotypeList.get(jjj); + final Allele jjj_allele = alleleOrdering.get(jjj); double haplotypeLikelihood = 0.0; for( final String sample : samples ) { - final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample); - final int[] readCounts_iii = iii_haplotype.getReadCounts(sample); - final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample); - for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { + for( final Map.Entry> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) // First term is approximated by Jacobian log with table lookup. - haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); + haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) * + ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + LOG_ONE_HALF ); } } - haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); + haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood; } } @@ -312,13 +269,16 @@ public class LikelihoodCalculationEngine { @Requires({"haplotypes.size() > 0"}) @Ensures({"result.size() <= haplotypes.size()"}) - public ArrayList selectBestHaplotypes( final ArrayList haplotypes ) { + public ArrayList selectBestHaplotypes( final ArrayList haplotypes, final Map stratifiedReadMap ) { final int numHaplotypes = haplotypes.size(); - final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples + final Set sampleKeySet = stratifiedReadMap.keySet(); final ArrayList bestHaplotypesIndexList = new ArrayList(); bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together + final List haplotypesAsAlleles = new ArrayList(); + for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h.getBases())); } + + final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles ); // all samples pooled together int hap1 = 0; int hap2 = 0; @@ -358,52 +318,4 @@ public class LikelihoodCalculationEngine { } throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); } - - public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, - final HashMap> perSampleReadList, - final HashMap> perSampleFilteredReadList, - final Pair>> call, - final double downsamplingFraction, - final PrintStream downsamplingLog ) { - final Map returnMap = new HashMap(); - final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst()); - for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); - - final ArrayList readsForThisSample = sample.getValue(); - for( int iii = 0; iii < readsForThisSample.size(); iii++ ) { - final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same! - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - for( final Allele a : call.getFirst().getAlleles() ) { - double maxLikelihood = Double.NEGATIVE_INFINITY; - for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object) - final double likelihood = h.getReadLikelihoods(sample.getKey())[iii]; - if( likelihood > maxLikelihood ) { - maxLikelihood = likelihood; - } - } - likelihoodMap.add(read, a, maxLikelihood); - } - } - } - - // down-sample before adding filtered reads - likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); - - // add all filtered reads to the NO_CALL list because they weren't given any likelihoods - for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - for( final Allele a : call.getFirst().getAlleles() ) { - likelihoodMap.add(read, a, 0.0); - } - } - } - - returnMap.put(sample.getKey(), likelihoodMap); - - } - return returnMap; - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 79962a3e4..7b797432d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -329,7 +329,6 @@ public class PairHMMIndelErrorModel { getContextHomopolymerLength(readBases,hrunProfile); fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - for (Allele a: haplotypeMap.keySet()) { Haplotype haplotype = haplotypeMap.get(a); diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 30fdce75d..4c708f2bf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -41,8 +41,6 @@ public class Haplotype { protected final byte[] bases; protected final double[] quals; private GenomeLoc genomeLocation = null; - private HashMap readLikelihoodsPerSample = null; - private HashMap readCountsPerSample = null; private HashMap eventMap = null; private boolean isRef = false; private Cigar cigar; @@ -94,31 +92,6 @@ public class Haplotype { return Arrays.hashCode(bases); } - public void addReadLikelihoods( final String sample, final double[] readLikelihoods, final int[] readCounts ) { - if( readLikelihoodsPerSample == null ) { - readLikelihoodsPerSample = new HashMap(); - } - readLikelihoodsPerSample.put(sample, readLikelihoods); - if( readCountsPerSample == null ) { - readCountsPerSample = new HashMap(); - } - readCountsPerSample.put(sample, readCounts); - } - - @Ensures({"result != null"}) - public double[] getReadLikelihoods( final String sample ) { - return readLikelihoodsPerSample.get(sample); - } - - @Ensures({"result != null"}) - public int[] getReadCounts( final String sample ) { - return readCountsPerSample.get(sample); - } - - public Set getSampleKeySet() { - return readLikelihoodsPerSample.keySet(); - } - public HashMap getEventMap() { return eventMap; } diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 22d249240..9bb0e646f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -38,10 +38,10 @@ import java.util.*; public abstract class PerReadAlleleLikelihoodMap { - public static final double INDEL_LIKELIHOOD_THRESH = 0.1; + public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.1; protected List alleles; - protected Map> likelihoodReadMap; + protected Map> likelihoodReadMap; public abstract void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log); public abstract ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log); @@ -68,7 +68,7 @@ public abstract class PerReadAlleleLikelihoodMap { } public void add(PileupElement p, Allele a, Double likelihood) { - add(p.getRead(),a,likelihood); + add(p.getRead(), a, likelihood); } public boolean containsPileupElement(PileupElement p) { @@ -120,7 +120,7 @@ public abstract class PerReadAlleleLikelihoodMap { prevMaxLike = el.getValue(); } } - return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL ); + return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL ); } public static PerReadAlleleLikelihoodMap getBestAvailablePerReadAlleleLikelihoodMap() {