From e40a07bb586752f23da345d3f0be69357fb8289b Mon Sep 17 00:00:00 2001 From: bradtaylor Date: Mon, 21 Oct 2013 14:08:03 -0400 Subject: [PATCH] Improve the PairHMM API for better FPGA integration Motivation: The API was different between the regular PairHMM and the FPGA-implementation via CnyPairHMM. As a result, the LikelihoodCalculationEngine had to account for this. The goal is to change the API to be the same for all implementations, and make it easier to access. PairHMM PairHMM now accepts a list of reads and a map of alleles/haplotypes and returns a PerReadAlleleLikelihoodMap. Added a new primary method that loops the reads and haplotypes, extracts qualities, and passes them to the computeReadLikelihoodGivenHaplotypeLog10 method. Did not alter that method, or its subcompute method, at all. PairHMM also now handles its own (re)initialization, so users don't have to worry about that. CnyPairHMM Added that same new primary access method to this FPGA class. Method overrides the default implementation in PairHMM. Walks through a list of reads. Individual-read quals and the full haplotype list are fed to batchAdd(), as before. However, instead of waiting for every read to get added, and then walking through the reads again to extract results, we just get the haplotype-results array for each read as soon as it is generated, and pack it into a perReadAlleleLikelihoodMap for return. The main access method is now the same no matter whether the FPGA CnyPairHMM is used or not. LikelihoodCalculationEngine The functionality to loop through the reads and haplotypes and get individual log10-likelihoods was moved to the PairHMM, and so removed from here. However, this class does need to retain the ability to pre-process the reads, and post-process the resulting likelihoods map. Those features were separated from running the HMM and refactored into their own methods. Commented out the (unused) system for finding best N haplotypes for genotyping. 
PairHMMIndelErrorModel Similar changes were made as to the LCE. However, in this case the haplotypes are modified based on each individual read, so the read-list we feed into the HMM only has one read. --- .../LikelihoodCalculationEngine.java | 546 +++++++++--------- .../indels/PairHMMIndelErrorModel.java | 150 +++-- .../utils/pairhmm/ArrayLoglessPairHMM.java | 1 + .../sting/utils/pairhmm/CnyPairHMM.java | 67 ++- .../sting/utils/pairhmm/LoglessPairHMM.java | 11 - .../genotyper/PerReadAlleleLikelihoodMap.java | 25 +- .../sting/utils/pairhmm/Log10PairHMM.java | 3 - .../sting/utils/pairhmm/N2MemoryPairHMM.java | 7 +- .../sting/utils/pairhmm/PairHMM.java | 87 ++- .../sting/utils/sam/GATKSAMRecord.java | 51 +- 10 files changed, 547 insertions(+), 401 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 0d55797bc..4eb728390 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -52,18 +52,10 @@ import net.sf.samtools.SAMUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator; -import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; -import 
org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM; -import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM; -import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -88,7 +80,7 @@ public class LikelihoodCalculationEngine { private final PairHMM.HMM_IMPLEMENTATION hmmType; private final boolean noFpga; - private final ThreadLocal pairHMM = new ThreadLocal() { + private final ThreadLocal pairHMMThreadLocal = new ThreadLocal() { @Override protected PairHMM initialValue() { switch (hmmType) { @@ -109,6 +101,8 @@ public class LikelihoodCalculationEngine { } } }; +// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException +// private final PairHMM pairHMM = pairHMMThreadLocal.get(); private final static boolean WRITE_LIKELIHOODS_TO_FILE = false; private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; @@ -173,36 +167,145 @@ public class LikelihoodCalculationEngine { if ( likelihoodsStream != null ) likelihoodsStream.close(); } - /** - * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate - * - * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples - * against the set of haplotypes - * - * @param haplotypes a non-null list of haplotypes - * @param perSampleReadList a mapping from sample -> reads - */ - private void initializePairHMM(final List haplotypes, final Map> perSampleReadList) { - int X_METRIC_LENGTH = 0; - for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - for( final GATKSAMRecord read : sample.getValue() ) { - final int readLength = 
read.getReadLength(); - if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; } - } + private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ + if ( WRITE_LIKELIHOODS_TO_FILE ) { + likelihoodsStream.printf("%s %s %s %s %s %s %f%n", + haplotype.getBaseString(), + new String(processedRead.getReadBases() ), + SAMUtils.phredToFastq(processedRead.getBaseQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ), + SAMUtils.phredToFastq(constantGCP), + log10l); } - int Y_METRIC_LENGTH = 0; - for( final Haplotype h : haplotypes ) { - final int haplotypeLength = h.getBases().length; - if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } - } - - // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases - pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); } + private Map createAlleleMap(List haplotypes){ + final int numHaplotypes = haplotypes.size(); + final Map alleleMap = new LinkedHashMap<>(numHaplotypes); + for ( final Haplotype haplotype : haplotypes ) { + final Allele allele = Allele.create(haplotype, true); + alleleMap.put(allele, haplotype); + } + return alleleMap; + } + + private Map fillGCPArrays(List reads){ + final Map GCPArrayMap = new LinkedHashMap<>(); + for (GATKSAMRecord read: reads){ + byte [] GCPArray = new byte[read.getReadBases().length]; + Arrays.fill( GCPArray, constantGCP ); // Is there a way to derive empirical estimates for this from the data? 
+ GCPArrayMap.put(read, GCPArray); + } + return GCPArrayMap; + } + + private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) { + for( int kkk = 0; kkk < readQuals.length; kkk++ ) { + readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG + readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); + readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); + } + } + + /** + * Pre-processing of the reads to be evaluated at the current location from the current sample. + * We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read. + * Modified copies of reads are packed into a new list, while original reads are retained for downstream use + * + * @param reads The original list of unmodified reads + * @return processedReads. 
A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding + */ + private List modifyReadQualities(final List reads) { + List processedReads = new LinkedList<>(); + for ( GATKSAMRecord read : reads ) { + + final byte[] readBases = read.getReadBases(); + + // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read + final byte[] readQuals = read.getBaseQualities().clone(); + final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); + final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + + applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + capMinimumReadQualities(read, readQuals, readInsQuals, readDelQuals); + + // Create a new copy of the read and sets its base qualities to the modified versions. + // Pack this into a new list for return + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals); + processedReads.add(processedRead); + } + return processedReads; + } + + /** + * Post-processing of the read/allele likelihoods. + * + * We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads. + * We wish to return a map containing the original, unmodified reads. + * + * At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate. + * This protects us from the case where the assembly has produced haplotypes + * that are very divergent from reference, but are supported by only one read. In effect + * we capping how badly scoring the reference can be for any read by the chance that the read + * itself just doesn't belong here + * + * @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. 
Contains the processed reads, the haplotype Alleles, and their log10ls + * @param reads Our original, unmodified reads + * @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls + * @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype + * + * @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls + */ + private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List reads, List processedReads, Map alleleHaplotypeMap){ + + // a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls + final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + + Allele refAllele = null; + final int numReads = reads.size(); + for (int readIndex = 0; readIndex < numReads; readIndex++) { + + // Get the original and quality-modified read from their respective lists + // Note that this requires both lists to have reads in the same order + final GATKSAMRecord originalRead = reads.get(readIndex); + final GATKSAMRecord processedRead = processedReads.get(readIndex); + + // keep track of the reference likelihood and the best non-ref likelihood + double refLog10l = Double.NEGATIVE_INFINITY; + double bestNonReflog10L = Double.NEGATIVE_INFINITY; + + for ( Allele allele : alleleHaplotypeMap.keySet() ) { + final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele); + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + if ( haplotype.isNonReference() ) + bestNonReflog10L = Math.max(bestNonReflog10L, log10l); + else { + refAllele = allele; + refLog10l = log10l; + } + writeDebugLikelihoods(processedRead, haplotype, log10l); + + // add the ORIGINAL (non-capped) 
read to the final map, along with the current haplotype and associated log10l + processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l); + } + + // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global + // mismapping rate. This protects us from the case where the assembly has produced haplotypes + // that are very divergent from reference, but are supported by only one read. In effect + // we capping how badly scoring the reference can be for any read by the chance that the read + // itself just doesn't belong here + final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; + if ( refLog10l < (worstRefLog10Allowed) ) { + processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed); + } + } + return processedReadAlleleLikelihoodMap; + } + + public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { - // configure the HMM - initializePairHMM(haplotypes, perSampleReadList); // Add likelihoods for each sample's reads to our stratifiedReadMap final Map stratifiedReadMap = new LinkedHashMap<>(); @@ -218,137 +321,22 @@ public class LikelihoodCalculationEngine { } private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { - // first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time) - final BatchPairHMM batchPairHMM = (pairHMM.get() instanceof BatchPairHMM) ? 
(BatchPairHMM)pairHMM.get() : null; - final Vector batchedReads = new Vector(reads.size()); - final int numHaplotypes = haplotypes.size(); - final Map alleleVersions = new LinkedHashMap<>(numHaplotypes); - Allele refAllele = null; - for ( final Haplotype haplotype : haplotypes ) { - final Allele allele = Allele.create(haplotype, true); - alleleVersions.put(haplotype, allele); - if ( haplotype.isReference() ) refAllele = allele; - } - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - for( final GATKSAMRecord read : reads ) { + // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities + List processedReads = modifyReadQualities(reads); - final byte[] readBases = read.getReadBases(); - final byte[] overallGCP = new byte[read.getReadLength()]; - Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? + // Get alleles corresponding to our haplotypees + Map alleleHaplotypeMap = createAlleleMap(haplotypes); - // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read - final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); - final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + // Get an array containing the constantGCP for each read in our modified read list + Map GCPArrayMap = fillGCPArrays(processedReads); - applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype + PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap); - for( int kkk = 0; kkk < readQuals.length; kkk++ ) { - readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base 
quality by mapping quality, as in UG - readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); - readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); - readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); - } + // Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate - if ( batchPairHMM != null ) { - batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); - batchedReads.add(read); - continue; - } - - // keep track of the reference likelihood and the best non-ref likelihood - double refLog10l = Double.NEGATIVE_INFINITY; - double bestNonReflog10L = Double.NEGATIVE_INFINITY; - - // iterate over all haplotypes, calculating the likelihood of the read for each haplotype - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - final Haplotype haplotype = haplotypes.get(jjj); - final byte[] nextHaplotypeBases = (jjj == numHaplotypes - 1) ? 
null : haplotypes.get(jjj+1).getBases(); - final boolean isFirstHaplotype = jjj == 0; - final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); - - if ( WRITE_LIKELIHOODS_TO_FILE ) { - likelihoodsStream.printf("%s %s %s %s %s %s %f%n", - haplotype.getBaseString(), - new String(readBases), - SAMUtils.phredToFastq(readQuals), - SAMUtils.phredToFastq(readInsQuals), - SAMUtils.phredToFastq(readDelQuals), - SAMUtils.phredToFastq(overallGCP), - log10l); - } - - if ( haplotype.isNonReference() ) - bestNonReflog10L = Math.max(bestNonReflog10L, log10l); - else - refLog10l = log10l; - - perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); - } - - // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global - // mismapping rate. This protects us from the case where the assembly has produced haplotypes - // that are very divergent from reference, but are supported by only one read. 
In effect - // we capping how badly scoring the reference can be for any read by the chance that the read - // itself just doesn't belong here - final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; - if ( refLog10l < (worstRefLog10Allowed) ) { - perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed); - } - } - - if ( batchPairHMM != null ) { - for( final GATKSAMRecord read : batchedReads ) { - double refLog10l = Double.NEGATIVE_INFINITY; - double bestNonReflog10L = Double.NEGATIVE_INFINITY; - final double[] likelihoods = batchPairHMM.batchGetResult(); - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - final Haplotype haplotype = haplotypes.get(jjj); - final double log10l = likelihoods[jjj]; - - if ( WRITE_LIKELIHOODS_TO_FILE ) { - final byte[] overallGCP = new byte[read.getReadLength()]; - Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? - // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read - final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); - for( int kkk = 0; kkk < readQuals.length; kkk++ ) { - readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG - //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated - //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated - // TODO -- why is Q18 hard-coded here??? - readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? 
QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); - } - likelihoodsStream.printf("%s %s %s %s %s %s %f%n", - haplotype.getBaseString(), - new String(read.getReadBases()), - SAMUtils.phredToFastq(readQuals), - SAMUtils.phredToFastq(readInsQuals), - SAMUtils.phredToFastq(readDelQuals), - SAMUtils.phredToFastq(overallGCP), - log10l); - } - - if ( haplotype.isNonReference() ) - bestNonReflog10L = Math.max(bestNonReflog10L, log10l); - else - refLog10l = log10l; - - - perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); - } - - final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; - if ( refLog10l < (worstRefLog10Allowed) ) { - perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed); - } - } - } - - return perReadAlleleLikelihoodMap; + return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap); } @Requires({"alleleOrdering.size() > 0"}) @@ -421,125 +409,125 @@ public class LikelihoodCalculationEngine { // System to compute the best N haplotypes for genotyping // // -------------------------------------------------------------------------------- - - /** - * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele - * @param map an annoying map object that moves us between the allele and haplotype representation - * @param haplotypeAsAllele the allele version of the haplotype - * @return the haplotype version, with its score incremented by 1 if its non-reference - */ - private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { - final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic - if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value - return h; - } - - /** - * Take the best N haplotypes and return them as a list - * - * Only considers the haplotypes selectedHaplotypes that were actually 
selected by at least one sample - * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing - * order of score (so higher score haplotypes are preferred). The N we take is determined by - * - * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) - * - * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is - * bounded by maxNumHaplotypesInPopulation as that number can grow without bound - * - * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 - * @param nSamples the number of samples used to select the haplotypes - * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples - * @return a list of N or fewer haplotypes, with the reference haplotype first - */ - private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { - final List selectedHaplotypesList = new ArrayList(selectedHaplotypes); - Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); - final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; - final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); - final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); - if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); - return bestHaplotypes; - } - - /** - * Select the best haplotypes for genotyping the samples in stratifiedReadMap - * - * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely - * haplotypes per sample. 
What this means is that each sample computes the diploid genotype likelihoods for - * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get - * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation - * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the - * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. - * - * @param haplotypes a list of all haplotypes we're considering - * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype - * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes - * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation - */ - public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { - if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); - - if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes - - // all of the haplotypes that at least one sample called as one of the most likely - final Set selectedHaplotypes = new HashSet<>(); - selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected - - // our annoying map from allele -> haplotype - final Map allele2Haplotype = new HashMap<>(); - for ( final Haplotype h : haplotypes ) { - h.setScore(h.isReference() ? 
Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes - allele2Haplotype.put(Allele.create(h, h.isReference()), h); - } - - // for each sample, compute the most likely pair of haplotypes - for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { - // get the two most likely haplotypes under a diploid model for this sample - final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); - - if ( mla != null ) { // there was something to evaluate in this sample - // note that there must be at least 2 haplotypes - final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); - final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); - -// if ( DEBUG ) { -// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +// +// /** +// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele +// * @param map an annoying map object that moves us between the allele and haplotype representation +// * @param haplotypeAsAllele the allele version of the haplotype +// * @return the haplotype version, with its score incremented by 1 if its non-reference +// */ +// private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { +// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic +// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value +// return h; +// } +// +// /** +// * Take the best N haplotypes and return them as a list +// * +// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample +// * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing +// * order of score (so higher score haplotypes are preferred). 
The N we take is determined by +// * +// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) +// * +// * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is +// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound +// * +// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 +// * @param nSamples the number of samples used to select the haplotypes +// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples +// * @return a list of N or fewer haplotypes, with the reference haplotype first +// */ +// private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { +// final List selectedHaplotypesList = new ArrayList<>(selectedHaplotypes); +// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); +// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; +// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); +// final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); +// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); +// return bestHaplotypes; +// } +// +// /** +// * Select the best haplotypes for genotyping the samples in stratifiedReadMap +// * +// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely +// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for +// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get +// * one extra count for each haplotype (so hom-var haplotypes get two counts). 
After performing this calculation +// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the +// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. +// * +// * @param haplotypes a list of all haplotypes we're considering +// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype +// * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes +// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation +// */ +// public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { +// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); +// +// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes +// +// // all of the haplotypes that at least one sample called as one of the most likely +// final Set selectedHaplotypes = new HashSet<>(); +// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected +// +// // our annoying map from allele -> haplotype +// final Map allele2Haplotype = new HashMap<>(); +// for ( final Haplotype h : haplotypes ) { +// h.setScore(h.isReference() ? 
Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes +// allele2Haplotype.put(Allele.create(h, h.isReference()), h); +// } +// +// // for each sample, compute the most likely pair of haplotypes +// for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { +// // get the two most likely haplotypes under a diploid model for this sample +// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); +// +// if ( mla != null ) { // there was something to evaluate in this sample +// // note that there must be at least 2 haplotypes +// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); +// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); +// +//// if ( DEBUG ) { +//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +//// } +// +// // add these two haplotypes to the set of haplotypes that have been selected +// selectedHaplotypes.add(best); +// selectedHaplotypes.add(second); +// +// // we've already selected all of our haplotypes, and we don't need to prune them down +// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) +// break; // } - - // add these two haplotypes to the set of haplotypes that have been selected - selectedHaplotypes.add(best); - selectedHaplotypes.add(second); - - // we've already selected all of our haplotypes, and we don't need to prune them down - if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) - break; - } - } - - // take the best N haplotypes forward, in order of the number of samples that choose them - final int nSamples = stratifiedReadMap.size(); - final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); - - if ( 
DEBUG ) { - logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); - for ( final Haplotype h : bestHaplotypes ) { - logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); - } - } - return bestHaplotypes; - } - - /** - * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found - * @param haplotypes non-null list of haplotypes - * @return the reference haplotype - */ - private static Haplotype findReferenceHaplotype( final List haplotypes ) { - for( final Haplotype h : haplotypes ) { - if( h.isReference() ) return h; - } - throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); - } +// } +// +// // take the best N haplotypes forward, in order of the number of samples that choose them +// final int nSamples = stratifiedReadMap.size(); +// final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); +// +// if ( DEBUG ) { +// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); +// for ( final Haplotype h : bestHaplotypes ) { +// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); +// } +// } +// return bestHaplotypes; +// } +// +// /** +// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found +// * @param haplotypes non-null list of haplotypes +// * @return the reference haplotype +// */ +// private static Haplotype findReferenceHaplotype( final List haplotypes ) { +// for( final Haplotype h : haplotypes ) { +// if( h.isReference() ) return h; +// } +// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" 
); +// } // -------------------------------------------------------------------------------- // diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 3c6e409b9..97431368e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -48,11 +48,11 @@ package org.broadinstitute.sting.gatk.walkers.indels; import com.google.java.contract.Ensures; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; @@ -65,6 +65,7 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; import java.util.LinkedHashMap; +import java.util.LinkedList; import java.util.Map; //import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM; @@ -206,6 +207,39 @@ public class PairHMMIndelErrorModel { } } + private LinkedHashMap trimHaplotypes(final LinkedHashMap haplotypeMap, + long startLocationInRefForHaplotypes, + long stopLocationInRefForHaplotypes, + final ReferenceContext ref){ + + final LinkedHashMap trimmedHaplotypeMap = new LinkedHashMap<>(); + for (final Allele a: haplotypeMap.keySet()) { + + final Haplotype haplotype = haplotypeMap.get(a); + + if (stopLocationInRefForHaplotypes > 
haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) + startLocationInRefForHaplotypes = haplotype.getStopPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); + + if (DEBUG) + System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n", + indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); + + // get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map + final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); + final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference()); + trimmedHaplotypeMap.put(a, trimmedHaplotype); + } + return trimmedHaplotypeMap; + } + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, final LinkedHashMap haplotypeMap, @@ -231,6 +265,8 @@ public class PairHMMIndelErrorModel { final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; + final LinkedList readList = new LinkedList<>(); + final Map readGCPArrayMap = new LinkedHashMap<>(); int readIdx=0; for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations @@ -369,86 +405,30 @@ public class PairHMMIndelErrorModel { baseDeletionQualities = contextLogGapOpenProbabilities; } - byte[] currentHaplotypeBases = null; - boolean firstHap = true; - double readLikelihood; - Allele currentAllele = null; - 
for (Allele a: haplotypeMap.keySet()) { + // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); + readList.add(processedRead); - Haplotype haplotype = haplotypeMap.get(a); + // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM + readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); - if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) - stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the approprate genomic locations + final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); - if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) - startLocationInRefForHaplotypes = haplotype.getStartPosition(); - else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) - startLocationInRefForHaplotypes = haplotype.getStopPosition(); + // Get the likelihoods for our clipped read against each of our trimmed haplotypes. 
+ final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); - final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); - final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); - - - if (DEBUG) - System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", - indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - - // peak at the next haplotype in the list - final byte[] nextHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - // process the current haplotype in the list - if (currentHaplotypeBases != null) { - // it's possible that the indel starts at the last base of the haplotypes - if ( currentHaplotypeBases.length == 0 ) { - readLikelihood = -Double.MAX_VALUE; - } else { - if (firstHap) { - //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(readBases.length, currentHaplotypeBases.length); - firstHap = false; - } - - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, nextHaplotypeBases); - } - - if (DEBUG) { - System.out.println("H:"+new String(currentHaplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - } - - perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood); - readLikelihoods[readIdx][j++] = readLikelihood; - } - // update the current haplotype - currentHaplotypeBases = nextHaplotypeBases; - currentAllele = a; - } - // process the final haplotype - if (currentHaplotypeBases != null) { - // it's possible that the indel starts at 
the last base of the haplotypes - if ( currentHaplotypeBases.length == 0 ) { - readLikelihood = -Double.MAX_VALUE; - } else { - if (firstHap) { - //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(readBases.length, currentHaplotypeBases.length); - firstHap = false; - } - // there is no next haplotype, so pass null for nextHaplotypeBases. - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, null); - } - - if (DEBUG) { - System.out.println("H:"+new String(currentHaplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - } - - perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood); + // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array + for (Allele a: trimmedHaplotypeMap.keySet()){ + double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); + perReadAlleleLikelihoodMap.add(p, a, readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } + // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually + readList.remove(processedRead); + + // The same is true for the read/GCP-array map + readGCPArrayMap.remove(processedRead); } } readIdx++; @@ -472,16 +452,16 @@ public class PairHMMIndelErrorModel { return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)); } - private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { - if (b1.length != b2.length) - return 0; // sanity check - - for (int i=0; i < b1.length; i++ ){ - if ( b1[i]!= b2[i] ) - return 
i; - } - return b1.length; - } +// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { +// if (b1.length != b2.length) +// return 0; // sanity check +// +// for (int i=0; i < b1.length; i++ ){ +// if ( b1[i]!= b2[i] ) +// return i; +// } +// return b1.length; +// } private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java index 4b996e770..a693ec22d 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java @@ -49,6 +49,7 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.QualityUtils; + import java.util.Arrays; /** diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java index d92b918ba..b80036bb2 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java @@ -47,12 +47,16 @@ package org.broadinstitute.sting.utils.pairhmm; import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; import java.io.File; import java.lang.reflect.Field; import java.util.Arrays; import java.util.LinkedList; import java.util.List; +import java.util.Map; public final class CnyPairHMM extends 
PairHMM implements BatchPairHMM { private static class HmmInput { @@ -62,14 +66,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { public byte[] deletionGOP; public byte[] overallGCP; public List haplotypes; - }; + } public static class ResultQueue { private int offset; private List batchResults; public ResultQueue() { - batchResults = new LinkedList(); + batchResults = new LinkedList<>(); offset = 0; } @@ -92,7 +96,7 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { final static String libName = "gmvhdl_gatk_hmm"; private static boolean loaded = false; - private List batchRequests = new LinkedList(); + private List batchRequests = new LinkedList<>(); private ResultQueue resultQueue = new ResultQueue(); static public boolean isAvailable() { @@ -184,6 +188,55 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { return results; } + /** + * {@inheritDoc} + */ + @Override + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap){ + + // initialize the pairHMM if necessary + if (! 
initialized) { + int readMaxLength = findMaxReadLength(reads); + int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + initialize(readMaxLength, haplotypeMaxLength); + } + + // Pass the read bases/quals, and the haplotypes as a list into the HMM + performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap); + + // Get the log10-likelihoods for each read/haplotype ant pack into the results map + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap); + + return likelihoodMap; + } + + private void collectLikelihoodResults(List reads, Map alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) { + for(final GATKSAMRecord read : reads){ + final double[] likelihoods = batchGetResult(); + int jjj = 0; + for (Allele allele : alleleHaplotypeMap.keySet()){ + final double log10l = likelihoods[jjj]; + likelihoodMap.add(read, allele, log10l); + jjj++; + } + } + } + + private void performBatchAdditions(List reads, Map alleleHaplotypeMap, Map GCPArrayMap) { + final List haplotypeList = getHaplotypeList(alleleHaplotypeMap); + for(final GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + batchAdd(haplotypeList, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); + } + } + + protected double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, @@ -196,6 +249,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { return 0.0; } + private List getHaplotypeList(Map alleleHaplotypeMap){ + final List haplotypeList = new LinkedList<>(); + for (Allele a : alleleHaplotypeMap.keySet()){ + 
haplotypeList.add(alleleHaplotypeMap.get(a)); + } + return haplotypeList; + } + private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) { double[] results = null; int n = dequeueRequirement(haplotypeBases.length, readBases.length); diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index e745ca1f5..de2994927 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -70,17 +70,6 @@ public final class LoglessPairHMM extends N2MemoryPairHMM { private static final int deletionToDeletion = 5; - /** - * {@inheritDoc} - */ - @Override - public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - } - /** * {@inheritDoc} */ diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 70be85f54..af22eb7f9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -110,7 +110,7 @@ public class PerReadAlleleLikelihoodMap { * @return a map from each allele to a list of reads that 'support' the allele */ protected Map> getAlleleStratifiedReadMap() { - final Map> alleleReadMap = new HashMap>(alleles.size()); + final Map> alleleReadMap = new HashMap<>(alleles.size()); for ( final Allele allele : alleles ) alleleReadMap.put(allele, new ArrayList()); @@ -152,7 +152,7 @@ public class PerReadAlleleLikelihoodMap { /** * Does the current map contain the key associated 
with a particular SAM record in pileup? * @param p Pileup element - * @return + * @return true if the map contains pileup element, else false */ public boolean containsPileupElement(final PileupElement p) { return likelihoodReadMap.containsKey(p.getRead()); @@ -176,9 +176,9 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.keySet(); } - public Collection> getLikelihoodMapValues() { - return likelihoodReadMap.values(); - } +// public Collection> getLikelihoodMapValues() { +// return likelihoodReadMap.values(); +// } public int getNumberOfStoredElements() { return likelihoodReadMap.size(); @@ -191,6 +191,21 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.get(p.getRead()); } + + /** + * Get the log10 likelihood associated with an individual read/allele + * + * @param read the read whose likelihood we want + * @param allele the allele whose likelihood we want + * @return the log10 likelihood that this read matches this allele + */ + public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){ + if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read)) + return 0.0; + + return likelihoodReadMap.get(read).get(allele); + } + /** * Get the most likely alleles estimated across all reads in this object * diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java index e7bc5cb56..83b87da95 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -85,9 +85,6 @@ public final class Log10PairHMM extends N2MemoryPairHMM { Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); } - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** diff 
--git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java index a091a0716..c02461c03 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java @@ -26,10 +26,6 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.Arrays; /** * Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix @@ -58,6 +54,9 @@ abstract class N2MemoryPairHMM extends PairHMM { matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + transition = new double[paddedMaxReadLength][6]; + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index eb52f4a85..ce3d43a06 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -28,9 +28,14 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; - +import java.util.List; +import java.util.Map; /** * Util class for 
performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * @@ -59,7 +64,7 @@ public abstract class PairHMM { protected int maxHaplotypeLength, maxReadLength; protected int paddedMaxReadLength, paddedMaxHaplotypeLength; protected int paddedReadLength, paddedHaplotypeLength; - private boolean initialized = false; + protected boolean initialized = false; // only used for debugging purposes protected boolean doNotUseTristateCorrection = false; @@ -73,7 +78,7 @@ public abstract class PairHMM { * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM * @param readMaxLength the max length of reads we want to use with this PairHMM */ - public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + protected void initialize( final int readMaxLength, final int haplotypeMaxLength ) { if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); @@ -90,6 +95,79 @@ public abstract class PairHMM { initialized = true; } + protected int findMaxReadLength(final List reads) { + int listMaxReadLength = 0; + for(GATKSAMRecord read : reads){ + final int readLength = read.getReadLength(); + if( readLength > listMaxReadLength ) { listMaxReadLength = readLength; } + } + return listMaxReadLength; + } + + protected int findMaxHaplotypeLength(final Map haplotypeMap) { + int listMaxHaplotypeLength = 0; + for( final Allele a: haplotypeMap.keySet() ) { + final Haplotype h = haplotypeMap.get(a); + final int haplotypeLength = h.getBases().length; + if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } + } + return listMaxHaplotypeLength; + } + + /** + * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from + * each haplotype given base 
substitution, insertion, and deletion probabilities. + * + * @param reads the list of reads + * @param alleleHaplotypeMap the list of haplotypes + * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. + * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of + * said read coming from the said haplotype under the provided error model + */ + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { + + // (re)initialize the pairHMM only if necessary + final int readMaxLength = findMaxReadLength(reads); + final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } + + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + for(GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) + byte[] currentHaplotypeBases = null; + boolean isFirstHaplotype = true; + Allele currentAllele = null; + double log10l; + for (final Allele allele : alleleHaplotypeMap.keySet()){ + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + final byte[] nextHaplotypeBases = haplotype.getBases(); + if (currentHaplotypeBases != null) { + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, 
isFirstHaplotype, nextHaplotypeBases); + likelihoodMap.add(read, currentAllele, log10l); + } + // update the current haplotype + currentHaplotypeBases = nextHaplotypeBases; + currentAllele = allele; + } + // process the final haplotype + if (currentHaplotypeBases != null) { + + // there is no next haplotype, so pass null for nextHaplotypeBases. + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); + likelihoodMap.add(read, currentAllele, log10l); + } + } + return likelihoodMap; + } + /** * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion * probabilities. @@ -110,7 +188,7 @@ public abstract class PairHMM { * parameters are the same, and only the haplotype bases are changing underneath us * @return the log10 probability of read coming from the haplotype under the provided error model */ - public final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final byte[] insertionGOP, @@ -118,6 +196,7 @@ public abstract class PairHMM { final byte[] overallGCP, final boolean recacheReadValues, final byte[] nextHaploytpeBases) { + if ( ! 
initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c39245730..93718b04d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -139,7 +139,7 @@ public class GATKSAMRecord extends BAMRecord { } public static GATKSAMRecord createRandomRead(int length) { - List cigarElements = new LinkedList(); + List cigarElements = new LinkedList<>(); cigarElements.add(new CigarElement(length, CigarOperator.M)); Cigar cigar = new Cigar(cigarElements); return ArtificialSAMUtils.createArtificialRead(cigar); @@ -536,10 +536,7 @@ public class GATKSAMRecord extends BAMRecord { * @return True if an attribute has been set for this key. 
*/ public boolean containsTemporaryAttribute(Object key) { - if(temporaryAttributes != null) { - return temporaryAttributes.containsKey(key); - } - return false; + return temporaryAttributes != null && temporaryAttributes.containsKey(key); } /** @@ -556,7 +553,7 @@ public class GATKSAMRecord extends BAMRecord { */ public Object setTemporaryAttribute(Object key, Object value) { if(temporaryAttributes == null) { - temporaryAttributes = new HashMap(); + temporaryAttributes = new HashMap<>(); } return temporaryAttributes.put(key, value); } @@ -750,6 +747,46 @@ public class GATKSAMRecord extends BAMRecord { return emptyRead; } + /** + * Creates a new GATKSAMRecord with the source read's header, read group and mate + * information, but with the following fields set to user-supplied values: + * - Read Bases + * - Base Qualities + * - Base Insertion Qualities + * - Base Deletion Qualities + * + * Cigar string is empty (not-null) + * + * Use this method if you want to create a new GATKSAMRecord based on + * another GATKSAMRecord, but with modified bases and qualities + * + * @param read a read to copy the header from + * @param readBases an array containing the new bases you wish use in place of the originals + * @param baseQualities an array containing the new base qualities you wish use in place of the originals + * @param baseInsertionQualities an array containing the new base insertion qaulities + * @param baseDeletionQualities an array containing the new base deletion qualities + * @return a read with modified bases and qualities, safe for the GATK + */ + public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read, + final byte[] readBases, + final byte[] baseQualities, + final byte[] baseInsertionQualities, + final byte[] baseDeletionQualities) { + if ( baseQualities.length != readBases.length || baseInsertionQualities.length != readBases.length || baseDeletionQualities.length != readBases.length ) + throw new IllegalArgumentException("Read bases 
and read quality arrays aren't the same size: Bases:" + readBases.length + + " vs Base Q's:" + baseQualities.length + + " vs Insert Q's:" + baseInsertionQualities.length + + " vs Delete Q's:" + baseDeletionQualities.length); + + final GATKSAMRecord processedRead = GATKSAMRecord.emptyRead(read); + processedRead.setReadBases(readBases); + processedRead.setBaseQualities(baseQualities, EventType.BASE_SUBSTITUTION); + processedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION); + processedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION); + + return processedRead; + } + /** * Shallow copy of everything, except for the attribute list and the temporary attributes. * A new list of the attributes is created for both, but the attributes themselves are copied by reference. @@ -762,7 +799,7 @@ public class GATKSAMRecord extends BAMRecord { public Object clone() throws CloneNotSupportedException { final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); if (temporaryAttributes != null) { - clone.temporaryAttributes = new HashMap(); + clone.temporaryAttributes = new HashMap<>(); for (Object attribute : temporaryAttributes.keySet()) clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); }