From ff180a8e02eaeffbc42226c789c6c6946affae68 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Nov 2012 09:09:57 -0500 Subject: [PATCH] Significant refactoring of the Haplotype Caller to handle problems with GGA. The main fix is that we now maintain a mapping from 'original' allele to 'Smith-Waterman-based' allele so that we no longer need to do a (buggy) matching throughout the calling process. --- .../haplotypecaller/GenotypingEngine.java | 253 ++++++------------ .../haplotypecaller/HaplotypeCaller.java | 6 +- .../LikelihoodCalculationEngine.java | 78 +++--- .../LikelihoodCalculationEngineUnitTest.java | 5 +- .../broadinstitute/sting/utils/Haplotype.java | 16 +- 5 files changed, 144 insertions(+), 214 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index d91df82e2..9fc636efe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -31,7 +31,6 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -52,153 +51,17 @@ public class GenotypingEngine { noCall.add(Allele.NO_CALL); } - // WARN - // This function is the streamlined approach, currently not being used by default - // WARN - // WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code. - // WARN - @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser ) { - // Prepare the list of haplotype indices to genotype - final ArrayList allelesToGenotype = new ArrayList(); - - for( final Haplotype h : haplotypes ) { - allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) ); - } - final int numHaplotypes = haplotypes.size(); - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample); - int glIndex = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make()); - } - final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); - if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here - - // Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF - final ArrayList haplotypesToRemove = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF - haplotypesToRemove.add(h); - } - } - haplotypes.removeAll(haplotypesToRemove); - - if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { - final List>>> returnVCs = new ArrayList>>>(); - // set up the default 1-to-1 haplotype mapping object - final HashMap> haplotypeMapping = new HashMap>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.put(call.getAllele(h.getBases()), list); - } - returnVCs.add( new Pair>>(call,haplotypeMapping) ); - return returnVCs; - } - - final ArrayList>>> returnCalls = new ArrayList>>>(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); - int count = 0; - if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } - for( final Haplotype h : haplotypes ) { - if( DEBUG ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); - } - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - startPosKeySet.addAll(h.getEventMap().keySet()); - } - - // Create the VC merge priority list - final ArrayList priorityList = new ArrayList(); - for( int iii = 0; iii < haplotypes.size(); iii++ ) { - priorityList.add("HC" + iii); - } - - // Walk along each position in the key set and create each event to be outputted - for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { - final ArrayList eventsAtThisLoc = new ArrayList(); - for( final Haplotype h : haplotypes ) { - final HashMap eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - } - } - - // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); - - // Merge the event to find a common reference representation - final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); - - final HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper - } - - if( DEBUG ) { - System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - //System.out.println("Event/haplotype allele mapping = " + alleleMapper); - } - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final int myNumHaplotypes = alleleMapper.size(); - final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); - int glIndex = 0; - for( int iii = 0; iii < myNumHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - - // using the allele mapping object translate the haplotype allele into the event allele - final Genotype g = new GenotypeBuilder(sample) - .alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes)) - .phased(loc != startPosKeySet.first()) - .PL(genotypeLikelihoods).make(); - myGenotypes.add(g); - } - returnCalls.add( new Pair>>( - new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) ); - } - } - return returnCalls; - } - // BUGBUG: Create a class to hold this complicated return type @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final ArrayList activeAllelesToGenotype ) { + public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { - final ArrayList>>> returnCalls = new ArrayList>>>(); + final ArrayList>>> returnCalls = new ArrayList>>>(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); @@ -261,7 +124,15 @@ public class GenotypingEngine { if( eventsAtThisLoc.isEmpty() ) { continue; } // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + Map> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + final ArrayList alleleOrdering = new ArrayList(alleleMapper.size()); + alleleOrdering.add(refAllele); + for ( final Allele allele : alleleMapper.keySet() ) { + if ( !refAllele.equals(allele) ) + alleleOrdering.add(allele); + } // Sanity check the priority list for( final VariantContext vc : eventsAtThisLoc ) { @@ -283,12 +154,6 @@ public class GenotypingEngine { final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } - HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper - } - if( DEBUG ) { System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); //System.out.println("Event/haplotype allele mapping = " + alleleMapper); @@ -299,7 +164,7 @@ public class GenotypingEngine { for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples final int numHaplotypes = alleleMapper.size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { @@ -313,23 +178,23 @@ public class GenotypingEngine { if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call); // also, need to update the allele -> haplotype mapping - final HashMap> alleleHashMapTrim = new HashMap>(); + final HashMap> alleleHashMapTrim = new HashMap>(); for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC - alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii))); + alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii))); } call = vcCallTrim; - alleleHashMap = alleleHashMapTrim; + alleleMapper = alleleHashMapTrim; } - returnCalls.add( new Pair>>(call, alleleHashMap) ); + returnCalls.add( new Pair>>(call, alleleMapper) ); } } } return returnCalls; } - protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes ) { + protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final ArrayList haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { for( final VariantContext vc : h.getEventMap().values() ) { @@ -348,7 +213,7 @@ public class GenotypingEngine { haplotypes.removeAll(haplotypesToRemove); } - protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { + protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { final int MAX_SIZE_TO_COMBINE = 15; final double MERGE_EVENTS_R2_THRESHOLD = 0.95; if( startPosKeySet.size() <= 1 ) { return; } @@ -395,7 +260,9 @@ public class GenotypingEngine { final ArrayList haplotypeList = new ArrayList(); haplotypeList.add(h); for( final String sample : haplotypes.get(0).getSampleKeySet() ) { - final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0]; + final HashSet sampleSet = new HashSet(1); + sampleSet.add(sample); + final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0]; if( thisHapVC == null ) { if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } @@ -489,37 +356,71 @@ public class GenotypingEngine { @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static ArrayList> createAlleleMapper( final int loc, final ArrayList eventsAtThisLoc, final ArrayList haplotypes ) { - final ArrayList> alleleMapper = new ArrayList>(); - final ArrayList refList = new ArrayList(); + protected static Map> createAlleleMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + + final Map> alleleMapper = new HashMap>(eventsAtThisLoc.size()+1); for( final Haplotype h : haplotypes ) { if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - refList.add(h); + if ( !alleleMapper.containsKey(refAllele) ) + alleleMapper.put(refAllele, new ArrayList()); + alleleMapper.get(refAllele).add(h); + } else if ( h.isArtificialHaplotype() ) { + if ( !alleleMapper.containsKey(h.getArtificialAllele()) ) + alleleMapper.put(h.getArtificialAllele(), new ArrayList()); + alleleMapper.get(h.getArtificialAllele()).add(h); } else { - boolean foundInEventList = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - foundInEventList = true; + final Allele altAllele = vcAtThisLoc.getAlternateAllele(0); + if ( !alleleMapper.containsKey(altAllele) ) + alleleMapper.put(altAllele, new ArrayList()); + alleleMapper.get(altAllele).add(h); + break; } } - if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype - refList.add(h); - } } } - alleleMapper.add(refList); - for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { - final ArrayList list = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - list.add(h); + + for( final Haplotype h : haplotypes ) { + if ( h.getEventMap().get(loc) == null || h.isArtificialHaplotype() ) + continue; + + Allele matchingAllele = null; + for ( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { + if ( alleleToTest.getKey().equals(refAllele) ) + continue; + + final Haplotype artificialHaplotype = alleleToTest.getValue().get(0); + if ( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap()) ) { + matchingAllele = alleleToTest.getKey(); + break; } } - alleleMapper.add(list); + + if ( matchingAllele == null ) + matchingAllele = refAllele; + alleleMapper.get(matchingAllele).add(h); } + return alleleMapper; } + protected static boolean isSubSetOf(final Map subset, final Map superset) { + + for ( final Map.Entry fromSubset : subset.entrySet() ) { + final VariantContext fromSuperset = superset.get(fromSubset.getKey()); + if ( fromSuperset == null ) + return false; + + if ( !fromSuperset.hasAlternateAllele(fromSubset.getValue().getAlternateAllele(0)) ) + return false; + } + + return true; + } + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) { if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index a185ba6af..2b739a321 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -421,10 +421,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes ); - for( final Pair>> callResult : - ( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() ) - : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { + for( final Pair>> callResult : + genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a0924623b..543987e74 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -148,34 +148,21 @@ public class LikelihoodCalculationEngine { return Math.min(b1.length, b2.length); } - @Requires({"haplotypes.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList haplotypes, final String sample ) { - // set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization? - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping ); - } - // This function takes just a single sample and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map> haplotypeMapping, final List alleleOrdering ) { final TreeSet sampleSet = new TreeSet(); sampleSet.add(sample); - return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping); + return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering); } // This function takes a set of samples to pool over and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final Map> haplotypeMapping, final List alleleOrdering ) { - final int numHaplotypes = haplotypeMapping.size(); + final int numHaplotypes = alleleOrdering.size(); final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; for( int iii = 0; iii < numHaplotypes; iii++ ) { Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); @@ -184,9 +171,9 @@ public class LikelihoodCalculationEngine { // compute the diploid haplotype likelihoods // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) { - for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { + for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) { double haplotypeLikelihood = 0.0; for( final String sample : samples ) { final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); @@ -200,12 +187,48 @@ public class LikelihoodCalculationEngine { } haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); } - } + } } } // normalize the diploid likelihoods matrix - return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + } + + // This function takes a set of samples to pool over and a haplotypeMapping + @Requires({"haplotypeMapping.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final List haplotypeList ) { + + final int numHaplotypes = haplotypeList.size(); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); + } + + // compute the diploid haplotype likelihoods + // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code + for( int iii = 0; iii < numHaplotypes; iii++ ) { + final Haplotype iii_haplotype = haplotypeList.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Haplotype jjj_haplotype = haplotypeList.get(jjj); + double haplotypeLikelihood = 0.0; + for( final String sample : samples ) { + final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample); + final int[] readCounts_iii = iii_haplotype.getReadCounts(sample); + final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample); + for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. + haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); + } + } + haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); + } + } + + // normalize the diploid likelihoods matrix + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); } @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) @@ -296,14 +319,7 @@ public class LikelihoodCalculationEngine { final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - // set up the default 1-to-1 haplotype mapping object - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypeMapping ); // all samples pooled together + final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together int hap1 = 0; int hap2 = 0; @@ -347,7 +363,7 @@ public class LikelihoodCalculationEngine { public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, - final Pair>> call, + final Pair>> call, final double downsamplingFraction, final PrintStream downsamplingLog ) { final Map returnMap = new HashMap(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java index e82946690..19ced9f42 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -102,7 +101,9 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { haplotypes.add(haplotype); } } - return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample"); + final HashSet sampleSet = new HashSet(1); + sampleSet.add("myTestSample"); + return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index b30d47074..6de15e18b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -49,6 +49,7 @@ public class Haplotype { private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; + private Allele artificialAllele = null; /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual @@ -71,6 +72,11 @@ public class Haplotype { this(bases, 0); } + public Haplotype( final byte[] bases, final Allele artificialAllele ) { + this(bases, 0); + this.artificialAllele = artificialAllele; + } + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; @@ -171,6 +177,14 @@ public class Haplotype { this.cigar = cigar; } + public boolean isArtificialHaplotype() { + return artificialAllele != null; + } + + public Allele getArtificialAllele() { + return artificialAllele; + } + @Requires({"refInsertLocation >= 0"}) public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates @@ -182,7 +196,7 @@ public class Haplotype { newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases); + return new Haplotype(newHaplotypeBases, altAllele); } public static class HaplotypeBaseComparator implements Comparator, Serializable {