diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 4fc2dc8f7..6f94e2657 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -30,12 +30,16 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.*; +import java.io.PrintStream; import java.util.*; public class GenotypingEngine { @@ -43,23 +47,27 @@ public class GenotypingEngine { private final boolean DEBUG; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); + private final VariantAnnotatorEngine annotationEngine; - public GenotypingEngine( final boolean DEBUG ) { + public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine ) { this.DEBUG = DEBUG; + this.annotationEngine = annotationEngine; noCall.add(Allele.NO_CALL); } - // BUGBUG: Create a class to hold this complicated return type @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, - final List haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final List activeAllelesToGenotype ) { + public List assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final List samples, + final Map haplotypeReadMap, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { - final ArrayList>>> returnCalls = new ArrayList>>>(); + final List returnCalls = new ArrayList(); final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file @@ -79,8 +87,8 @@ public class GenotypingEngine { } cleanUpSymbolicUnassembledEvents( haplotypes ); - if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure - mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); + if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure + mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc ); } if( in_GGA_mode ) { for( final VariantContext compVC : activeAllelesToGenotype ) { @@ -90,7 +98,7 @@ public class GenotypingEngine { // Walk along each position in the key set and create each event to be outputted for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { + if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view @@ -167,12 +175,14 @@ public class GenotypingEngine { //System.out.println("Event/haplotype allele mapping = " + alleleMapper); } + final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); + // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples + final GenotypesContext genotypes = GenotypesContext.create(samples.size()); + for( final String sample : samples ) { final int numHaplotypes = alleleMapper.size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering); + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, alleleOrdering); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { @@ -183,25 +193,55 @@ public class GenotypingEngine { } VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); if( call != null ) { - if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! - final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call); - // also, need to update the allele -> haplotype mapping - final HashMap> alleleHashMapTrim = new HashMap>(); - for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC - alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii))); - } + final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap, perSampleFilteredReadList, call ); + VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call); - call = vcCallTrim; - alleleMapper = alleleHashMapTrim; + if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! + annotatedCall = VariantContextUtils.reverseTrimAlleles(annotatedCall); } - returnCalls.add( new Pair>>(call, alleleMapper) ); + returnCalls.add( annotatedCall ); } } } return returnCalls; } + private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, + final Map perSampleReadMap, + final Map> perSampleFilteredReadList, + final VariantContext call ) { + + final Map returnMap = new HashMap(); + final GenomeLoc callLoc = parser.createGenomeLoc(call); + for( final Map.Entry sample : perSampleReadMap.entrySet() ) { + final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); + + for( final Map.Entry> mapEntry : likelihoodMap.getLikelihoodReadMap().entrySet() ) { + // only count the read if it overlaps the event, otherwise it is not added to the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) { + for( final Map.Entry a : mapEntry.getValue().entrySet() ) { + likelihoodMap.add(mapEntry.getKey(), a.getKey(), a.getValue()); + } + } + } + + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods + for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { + // only count the read if it overlaps the event, otherwise it is not added to the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { + for( final Allele a : call.getAlleles() ) { + likelihoodMap.add(read, a, 0.0); + } + } + } + + returnMap.put(sample.getKey(), likelihoodMap); + } + return returnMap; + } + + protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final ArrayList haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { @@ -221,7 +261,41 @@ public class GenotypingEngine { haplotypes.removeAll(haplotypesToRemove); } - protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { + // BUGBUG: ugh, too complicated + protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, + final Map> alleleMapper, + final double downsamplingFraction, + final PrintStream downsamplingLog ) { + + final Map alleleReadMap = new HashMap(); + for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); + for( final Map.Entry> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele + final List mappedHaplotypes = alleleMapperEntry.getValue(); + for( final Map.Entry> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read + double maxLikelihood = Double.NEGATIVE_INFINITY; + for( final Map.Entry alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele + if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey().getBases())) ) { // exact match of haplotype base string + maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() ); + } + } + perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); + } + } + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling + alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); + } + + return alleleReadMap; + } + + protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, + final List samples, + final Map haplotypeReadMap, + final TreeSet startPosKeySet, + final byte[] ref, + final GenomeLoc refLoc ) { + final int MAX_SIZE_TO_COMBINE = 15; final double MERGE_EVENTS_R2_THRESHOLD = 0.95; if( startPosKeySet.size() <= 1 ) { return; } @@ -265,12 +339,13 @@ public class GenotypingEngine { } } // count up the co-occurrences of the events for the R^2 calculation - final ArrayList haplotypeList = new ArrayList(); - haplotypeList.add(h); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { + for( final String sample : samples ) { final HashSet sampleSet = new HashSet(1); sampleSet.add(sample); - final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0]; + + final List alleleList = new ArrayList(); + alleleList.add(Allele.create(h.getBases())); + final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeReadMap, alleleList )[0][0]; if( thisHapVC == null ) { if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 24b3309f1..2b3513bef 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -202,9 +202,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // the genotyping engine private GenotypingEngine genotypingEngine = null; - // the annotation engine - private VariantAnnotatorEngine annotationEngine; - // fasta reference reader to supplement the edges of the reference sequence private CachingIndexedFastaSequenceFile referenceReader; @@ -249,7 +246,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header - annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); Set headerInfo = new HashSet(); @@ -282,7 +279,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG ); + genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine ); } //--------------------------------------------------------------------------------------------------------------- @@ -398,21 +395,23 @@ public class HaplotypeCaller extends ActiveRegionWalker implem Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); // evaluate each sample's reads against all haplotypes - final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() ); - final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads ); - likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList ); + final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) ); + final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes ); + final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap ) : haplotypes ); - for( final Pair>> callResult : - genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { - if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } - - final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); - final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); - final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); + for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, + bestHaplotypes, + samplesList, + stratifiedReadMap, + perSampleFilteredReadList, + fullReferenceWithPadding, + getPaddedLoc(activeRegion), + activeRegion.getLocation(), + getToolkit().getGenomeLocParser(), + activeAllelesToGenotype ) ) { + vcfWriter.add( call ); } if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } @@ -467,7 +466,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage - // BUGBUG: remove when positinal downsampler is hooked up to ART/HC + // BUGBUG: remove when positional downsampler is hooked up to ART/HC if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { activeRegion.add(clippedRead); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 29622ca17..4a5c7fe9b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -71,8 +71,9 @@ public class LikelihoodCalculationEngine { DEBUG = debug; } - public void computeReadLikelihoods( final ArrayList haplotypes, final HashMap> perSampleReadList ) { + public Map computeReadLikelihoods( final ArrayList haplotypes, final HashMap> perSampleReadList ) { + final Map stratifiedReadMap = new HashMap(); int X_METRIC_LENGTH = 0; for( final Map.Entry> sample : perSampleReadList.entrySet() ) { for( final GATKSAMRecord read : sample.getValue() ) { @@ -97,20 +98,16 @@ public class LikelihoodCalculationEngine { for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() ); + stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue(), sampleEntry.getKey())); } + return stratifiedReadMap; } - private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) { + private PerReadAlleleLikelihoodMap computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); final int numHaplotypes = haplotypes.size(); - final int numReads = reads.size(); - final double[][] readLikelihoods = new double[numHaplotypes][numReads]; - final int[][] readCounts = new int[numHaplotypes][numReads]; - for( int iii = 0; iii < numReads; iii++ ) { - final GATKSAMRecord read = reads.get(iii); - final int readCount = ReadUtils.getMeanRepresentativeReadCount(read); - + for( final GATKSAMRecord read : reads ) { final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? Haplotype previousHaplotypeSeen = null; @@ -129,14 +126,12 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), - readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0); - readCounts[jjj][iii] = readCount; + perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()), + pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), + readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); } } - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] ); - } + return perReadAlleleLikelihoodMap; } private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) { @@ -148,19 +143,21 @@ public class LikelihoodCalculationEngine { return Math.min(b1.length, b2.length); } - // This function takes just a single sample and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map> haplotypeMapping, final List alleleOrdering ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, + final Map stratifiedReadMap, + final List alleleOrdering ) { final TreeSet sampleSet = new TreeSet(); sampleSet.add(sample); - return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering); + return computeDiploidHaplotypeLikelihoods(sampleSet, stratifiedReadMap, alleleOrdering); } - // This function takes a set of samples to pool over and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final Map> haplotypeMapping, final List alleleOrdering ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, + final Map stratifiedReadMap, + final List alleleOrdering ) { final int numHaplotypes = alleleOrdering.size(); final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; @@ -170,59 +167,19 @@ public class LikelihoodCalculationEngine { // compute the diploid haplotype likelihoods for( int iii = 0; iii < numHaplotypes; iii++ ) { + final Allele iii_allele = alleleOrdering.get(iii); for( int jjj = 0; jjj <= iii; jjj++ ) { - for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { - for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) { - double haplotypeLikelihood = 0.0; - for( final String sample : samples ) { - final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); - final int[] readCounts_iii = iii_mapped.getReadCounts(sample); - final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample); - for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { - // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - // First term is approximated by Jacobian log with table lookup. - haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); - } - } - haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); - } - } - } - } - - // normalize the diploid likelihoods matrix - return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); - } - - // This function takes a set of samples to pool over and a haplotypeMapping - @Requires({"haplotypeList.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final List haplotypeList ) { - - final int numHaplotypes = haplotypeList.size(); - final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); - } - - // compute the diploid haplotype likelihoods - // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code - for( int iii = 0; iii < numHaplotypes; iii++ ) { - final Haplotype iii_haplotype = haplotypeList.get(iii); - for( int jjj = 0; jjj <= iii; jjj++ ) { - final Haplotype jjj_haplotype = haplotypeList.get(jjj); + final Allele jjj_allele = alleleOrdering.get(jjj); double haplotypeLikelihood = 0.0; for( final String sample : samples ) { - final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample); - final int[] readCounts_iii = iii_haplotype.getReadCounts(sample); - final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample); - for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { + for( final Map.Entry> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) // First term is approximated by Jacobian log with table lookup. - haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); + haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) * + ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + LOG_ONE_HALF ); } } - haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); + haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood; } } @@ -312,13 +269,16 @@ public class LikelihoodCalculationEngine { @Requires({"haplotypes.size() > 0"}) @Ensures({"result.size() <= haplotypes.size()"}) - public ArrayList selectBestHaplotypes( final ArrayList haplotypes ) { + public ArrayList selectBestHaplotypes( final ArrayList haplotypes, final Map stratifiedReadMap ) { final int numHaplotypes = haplotypes.size(); - final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples + final Set sampleKeySet = stratifiedReadMap.keySet(); final ArrayList bestHaplotypesIndexList = new ArrayList(); bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together + final List haplotypesAsAlleles = new ArrayList(); + for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h.getBases())); } + + final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles ); // all samples pooled together int hap1 = 0; int hap2 = 0; @@ -358,52 +318,4 @@ public class LikelihoodCalculationEngine { } throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); } - - public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, - final HashMap> perSampleReadList, - final HashMap> perSampleFilteredReadList, - final Pair>> call, - final double downsamplingFraction, - final PrintStream downsamplingLog ) { - final Map returnMap = new HashMap(); - final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst()); - for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); - - final ArrayList readsForThisSample = sample.getValue(); - for( int iii = 0; iii < readsForThisSample.size(); iii++ ) { - final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same! - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - for( final Allele a : call.getFirst().getAlleles() ) { - double maxLikelihood = Double.NEGATIVE_INFINITY; - for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object) - final double likelihood = h.getReadLikelihoods(sample.getKey())[iii]; - if( likelihood > maxLikelihood ) { - maxLikelihood = likelihood; - } - } - likelihoodMap.add(read, a, maxLikelihood); - } - } - } - - // down-sample before adding filtered reads - likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); - - // add all filtered reads to the NO_CALL list because they weren't given any likelihoods - for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - for( final Allele a : call.getFirst().getAlleles() ) { - likelihoodMap.add(read, a, 0.0); - } - } - } - - returnMap.put(sample.getKey(), likelihoodMap); - - } - return returnMap; - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 79962a3e4..7b797432d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -329,7 +329,6 @@ public class PairHMMIndelErrorModel { getContextHomopolymerLength(readBases,hrunProfile); fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - for (Allele a: haplotypeMap.keySet()) { Haplotype haplotype = haplotypeMap.get(a); diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 30fdce75d..4c708f2bf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -41,8 +41,6 @@ public class Haplotype { protected final byte[] bases; protected final double[] quals; private GenomeLoc genomeLocation = null; - private HashMap readLikelihoodsPerSample = null; - private HashMap readCountsPerSample = null; private HashMap eventMap = null; private boolean isRef = false; private Cigar cigar; @@ -94,31 +92,6 @@ public class Haplotype { return Arrays.hashCode(bases); } - public void addReadLikelihoods( final String sample, final double[] readLikelihoods, final int[] readCounts ) { - if( readLikelihoodsPerSample == null ) { - readLikelihoodsPerSample = new HashMap(); - } - readLikelihoodsPerSample.put(sample, readLikelihoods); - if( readCountsPerSample == null ) { - readCountsPerSample = new HashMap(); - } - readCountsPerSample.put(sample, readCounts); - } - - @Ensures({"result != null"}) - public double[] getReadLikelihoods( final String sample ) { - return readLikelihoodsPerSample.get(sample); - } - - @Ensures({"result != null"}) - public int[] getReadCounts( final String sample ) { - return readCountsPerSample.get(sample); - } - - public Set getSampleKeySet() { - return readLikelihoodsPerSample.keySet(); - } - public HashMap getEventMap() { return eventMap; } diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 22d249240..9bb0e646f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -38,10 +38,10 @@ import java.util.*; public abstract class PerReadAlleleLikelihoodMap { - public static final double INDEL_LIKELIHOOD_THRESH = 0.1; + public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.1; protected List alleles; - protected Map> likelihoodReadMap; + protected Map> likelihoodReadMap; public abstract void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log); public abstract ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log); @@ -68,7 +68,7 @@ public abstract class PerReadAlleleLikelihoodMap { } public void add(PileupElement p, Allele a, Double likelihood) { - add(p.getRead(),a,likelihood); + add(p.getRead(), a, likelihood); } public boolean containsPileupElement(PileupElement p) { @@ -120,7 +120,7 @@ public abstract class PerReadAlleleLikelihoodMap { prevMaxLike = el.getValue(); } } - return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL ); + return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL ); } public static PerReadAlleleLikelihoodMap getBestAvailablePerReadAlleleLikelihoodMap() {