diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 0d55797bc..4eb728390 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -52,18 +52,10 @@ import net.sf.samtools.SAMUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator; -import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; -import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM; -import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM; -import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -88,7 +80,7 @@ public class LikelihoodCalculationEngine { private final PairHMM.HMM_IMPLEMENTATION hmmType; private final boolean noFpga; - private final ThreadLocal pairHMM = new ThreadLocal() { + private final ThreadLocal pairHMMThreadLocal = new ThreadLocal() { 
@Override protected PairHMM initialValue() { switch (hmmType) { @@ -109,6 +101,8 @@ public class LikelihoodCalculationEngine { } } }; +// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException +// private final PairHMM pairHMM = pairHMMThreadLocal.get(); private final static boolean WRITE_LIKELIHOODS_TO_FILE = false; private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; @@ -173,36 +167,145 @@ public class LikelihoodCalculationEngine { if ( likelihoodsStream != null ) likelihoodsStream.close(); } - /** - * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate - * - * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples - * against the set of haplotypes - * - * @param haplotypes a non-null list of haplotypes - * @param perSampleReadList a mapping from sample -> reads - */ - private void initializePairHMM(final List haplotypes, final Map> perSampleReadList) { - int X_METRIC_LENGTH = 0; - for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - for( final GATKSAMRecord read : sample.getValue() ) { - final int readLength = read.getReadLength(); - if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; } - } + private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ + if ( WRITE_LIKELIHOODS_TO_FILE ) { + likelihoodsStream.printf("%s %s %s %s %s %s %f%n", + haplotype.getBaseString(), + new String(processedRead.getReadBases() ), + SAMUtils.phredToFastq(processedRead.getBaseQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ), + SAMUtils.phredToFastq(constantGCP), + log10l); } - int Y_METRIC_LENGTH = 0; - for( final Haplotype h : haplotypes ) { - final int haplotypeLength = h.getBases().length; - if( haplotypeLength > 
Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } - } - - // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases - pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); } + private Map createAlleleMap(List haplotypes){ + final int numHaplotypes = haplotypes.size(); + final Map alleleMap = new LinkedHashMap<>(numHaplotypes); + for ( final Haplotype haplotype : haplotypes ) { + final Allele allele = Allele.create(haplotype, true); + alleleMap.put(allele, haplotype); + } + return alleleMap; + } + + private Map fillGCPArrays(List reads){ + final Map GCPArrayMap = new LinkedHashMap<>(); + for (GATKSAMRecord read: reads){ + byte [] GCPArray = new byte[read.getReadBases().length]; + Arrays.fill( GCPArray, constantGCP ); // Is there a way to derive empirical estimates for this from the data? + GCPArrayMap.put(read, GCPArray); + } + return GCPArrayMap; + } + + private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) { + for( int kkk = 0; kkk < readQuals.length; kkk++ ) { + readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG + readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); + readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); + } + } + + /** + * Pre-processing of the reads to be evaluated at the current location from the current sample. + * We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read. 
+ * Modified copies of reads are packed into a new list, while original reads are retained for downstream use + * + * @param reads The original list of unmodified reads + * @return processedReads. A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding + */ + private List modifyReadQualities(final List reads) { + List processedReads = new LinkedList<>(); + for ( GATKSAMRecord read : reads ) { + + final byte[] readBases = read.getReadBases(); + + // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read + final byte[] readQuals = read.getBaseQualities().clone(); + final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); + final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + + applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + capMinimumReadQualities(read, readQuals, readInsQuals, readDelQuals); + + // Create a new copy of the read and sets its base qualities to the modified versions. + // Pack this into a new list for return + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals); + processedReads.add(processedRead); + } + return processedReads; + } + + /** + * Post-processing of the read/allele likelihoods. + * + * We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads. + * We wish to return a map containing the original, unmodified reads. + * + * At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate. + * This protects us from the case where the assembly has produced haplotypes + * that are very divergent from reference, but are supported by only one read. 
In effect + * we capping how badly scoring the reference can be for any read by the chance that the read + * itself just doesn't belong here + * + * @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. Contains the processed reads, the haplotype Alleles, and their log10ls + * @param reads Our original, unmodified reads + * @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls + * @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype + * + * @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls + */ + private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List reads, List processedReads, Map alleleHaplotypeMap){ + + // a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls + final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + + Allele refAllele = null; + final int numReads = reads.size(); + for (int readIndex = 0; readIndex < numReads; readIndex++) { + + // Get the original and quality-modified read from their respective lists + // Note that this requires both lists to have reads in the same order + final GATKSAMRecord originalRead = reads.get(readIndex); + final GATKSAMRecord processedRead = processedReads.get(readIndex); + + // keep track of the reference likelihood and the best non-ref likelihood + double refLog10l = Double.NEGATIVE_INFINITY; + double bestNonReflog10L = Double.NEGATIVE_INFINITY; + + for ( Allele allele : alleleHaplotypeMap.keySet() ) { + final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele); + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + if ( 
haplotype.isNonReference() ) + bestNonReflog10L = Math.max(bestNonReflog10L, log10l); + else { + refAllele = allele; + refLog10l = log10l; + } + writeDebugLikelihoods(processedRead, haplotype, log10l); + + // add the ORIGINAL (non-capped) read to the final map, along with the current haplotype and associated log10l + processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l); + } + + // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global + // mismapping rate. This protects us from the case where the assembly has produced haplotypes + // that are very divergent from reference, but are supported by only one read. In effect + // we capping how badly scoring the reference can be for any read by the chance that the read + // itself just doesn't belong here + final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; + if ( refLog10l < (worstRefLog10Allowed) ) { + processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed); + } + } + return processedReadAlleleLikelihoodMap; + } + + public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { - // configure the HMM - initializePairHMM(haplotypes, perSampleReadList); // Add likelihoods for each sample's reads to our stratifiedReadMap final Map stratifiedReadMap = new LinkedHashMap<>(); @@ -218,137 +321,22 @@ public class LikelihoodCalculationEngine { } private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { - // first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time) - final BatchPairHMM batchPairHMM = (pairHMM.get() instanceof BatchPairHMM) ? 
(BatchPairHMM)pairHMM.get() : null; - final Vector batchedReads = new Vector(reads.size()); - final int numHaplotypes = haplotypes.size(); - final Map alleleVersions = new LinkedHashMap<>(numHaplotypes); - Allele refAllele = null; - for ( final Haplotype haplotype : haplotypes ) { - final Allele allele = Allele.create(haplotype, true); - alleleVersions.put(haplotype, allele); - if ( haplotype.isReference() ) refAllele = allele; - } - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - for( final GATKSAMRecord read : reads ) { + // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities + List processedReads = modifyReadQualities(reads); - final byte[] readBases = read.getReadBases(); - final byte[] overallGCP = new byte[read.getReadLength()]; - Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? + // Get alleles corresponding to our haplotypees + Map alleleHaplotypeMap = createAlleleMap(haplotypes); - // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read - final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); - final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + // Get an array containing the constantGCP for each read in our modified read list + Map GCPArrayMap = fillGCPArrays(processedReads); - applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype + PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap); - for( int kkk = 0; kkk < readQuals.length; kkk++ ) { - readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base 
quality by mapping quality, as in UG - readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); - readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); - readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); - } + // Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate - if ( batchPairHMM != null ) { - batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); - batchedReads.add(read); - continue; - } - - // keep track of the reference likelihood and the best non-ref likelihood - double refLog10l = Double.NEGATIVE_INFINITY; - double bestNonReflog10L = Double.NEGATIVE_INFINITY; - - // iterate over all haplotypes, calculating the likelihood of the read for each haplotype - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - final Haplotype haplotype = haplotypes.get(jjj); - final byte[] nextHaplotypeBases = (jjj == numHaplotypes - 1) ? 
null : haplotypes.get(jjj+1).getBases(); - final boolean isFirstHaplotype = jjj == 0; - final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); - - if ( WRITE_LIKELIHOODS_TO_FILE ) { - likelihoodsStream.printf("%s %s %s %s %s %s %f%n", - haplotype.getBaseString(), - new String(readBases), - SAMUtils.phredToFastq(readQuals), - SAMUtils.phredToFastq(readInsQuals), - SAMUtils.phredToFastq(readDelQuals), - SAMUtils.phredToFastq(overallGCP), - log10l); - } - - if ( haplotype.isNonReference() ) - bestNonReflog10L = Math.max(bestNonReflog10L, log10l); - else - refLog10l = log10l; - - perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); - } - - // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global - // mismapping rate. This protects us from the case where the assembly has produced haplotypes - // that are very divergent from reference, but are supported by only one read. 
In effect - // we capping how badly scoring the reference can be for any read by the chance that the read - // itself just doesn't belong here - final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; - if ( refLog10l < (worstRefLog10Allowed) ) { - perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed); - } - } - - if ( batchPairHMM != null ) { - for( final GATKSAMRecord read : batchedReads ) { - double refLog10l = Double.NEGATIVE_INFINITY; - double bestNonReflog10L = Double.NEGATIVE_INFINITY; - final double[] likelihoods = batchPairHMM.batchGetResult(); - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - final Haplotype haplotype = haplotypes.get(jjj); - final double log10l = likelihoods[jjj]; - - if ( WRITE_LIKELIHOODS_TO_FILE ) { - final byte[] overallGCP = new byte[read.getReadLength()]; - Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? - // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read - final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); - for( int kkk = 0; kkk < readQuals.length; kkk++ ) { - readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG - //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated - //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated - // TODO -- why is Q18 hard-coded here??? - readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? 
QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); - } - likelihoodsStream.printf("%s %s %s %s %s %s %f%n", - haplotype.getBaseString(), - new String(read.getReadBases()), - SAMUtils.phredToFastq(readQuals), - SAMUtils.phredToFastq(readInsQuals), - SAMUtils.phredToFastq(readDelQuals), - SAMUtils.phredToFastq(overallGCP), - log10l); - } - - if ( haplotype.isNonReference() ) - bestNonReflog10L = Math.max(bestNonReflog10L, log10l); - else - refLog10l = log10l; - - - perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); - } - - final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; - if ( refLog10l < (worstRefLog10Allowed) ) { - perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed); - } - } - } - - return perReadAlleleLikelihoodMap; + return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap); } @Requires({"alleleOrdering.size() > 0"}) @@ -421,125 +409,125 @@ public class LikelihoodCalculationEngine { // System to compute the best N haplotypes for genotyping // // -------------------------------------------------------------------------------- - - /** - * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele - * @param map an annoying map object that moves us between the allele and haplotype representation - * @param haplotypeAsAllele the allele version of the haplotype - * @return the haplotype version, with its score incremented by 1 if its non-reference - */ - private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { - final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic - if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value - return h; - } - - /** - * Take the best N haplotypes and return them as a list - * - * Only considers the haplotypes selectedHaplotypes that were actually 
selected by at least one sample - * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing - * order of score (so higher score haplotypes are preferred). The N we take is determined by - * - * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) - * - * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is - * bounded by maxNumHaplotypesInPopulation as that number can grow without bound - * - * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 - * @param nSamples the number of samples used to select the haplotypes - * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples - * @return a list of N or fewer haplotypes, with the reference haplotype first - */ - private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { - final List selectedHaplotypesList = new ArrayList(selectedHaplotypes); - Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); - final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; - final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); - final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); - if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); - return bestHaplotypes; - } - - /** - * Select the best haplotypes for genotyping the samples in stratifiedReadMap - * - * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely - * haplotypes per sample. 
What this means is that each sample computes the diploid genotype likelihoods for - * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get - * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation - * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the - * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. - * - * @param haplotypes a list of all haplotypes we're considering - * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype - * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes - * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation - */ - public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { - if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); - - if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes - - // all of the haplotypes that at least one sample called as one of the most likely - final Set selectedHaplotypes = new HashSet<>(); - selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected - - // our annoying map from allele -> haplotype - final Map allele2Haplotype = new HashMap<>(); - for ( final Haplotype h : haplotypes ) { - h.setScore(h.isReference() ? 
Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes - allele2Haplotype.put(Allele.create(h, h.isReference()), h); - } - - // for each sample, compute the most likely pair of haplotypes - for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { - // get the two most likely haplotypes under a diploid model for this sample - final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); - - if ( mla != null ) { // there was something to evaluate in this sample - // note that there must be at least 2 haplotypes - final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); - final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); - -// if ( DEBUG ) { -// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +// +// /** +// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele +// * @param map an annoying map object that moves us between the allele and haplotype representation +// * @param haplotypeAsAllele the allele version of the haplotype +// * @return the haplotype version, with its score incremented by 1 if its non-reference +// */ +// private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { +// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic +// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value +// return h; +// } +// +// /** +// * Take the best N haplotypes and return them as a list +// * +// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample +// * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing +// * order of score (so higher score haplotypes are preferred). 
The N we take is determined by +// * +// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) +// * +// * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is +// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound +// * +// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 +// * @param nSamples the number of samples used to select the haplotypes +// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples +// * @return a list of N or fewer haplotypes, with the reference haplotype first +// */ +// private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { +// final List selectedHaplotypesList = new ArrayList<>(selectedHaplotypes); +// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); +// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; +// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); +// final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); +// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); +// return bestHaplotypes; +// } +// +// /** +// * Select the best haplotypes for genotyping the samples in stratifiedReadMap +// * +// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely +// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for +// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get +// * one extra count for each haplotype (so hom-var haplotypes get two counts). 
After performing this calculation +// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the +// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. +// * +// * @param haplotypes a list of all haplotypes we're considering +// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype +// * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes +// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation +// */ +// public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { +// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); +// +// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes +// +// // all of the haplotypes that at least one sample called as one of the most likely +// final Set selectedHaplotypes = new HashSet<>(); +// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected +// +// // our annoying map from allele -> haplotype +// final Map allele2Haplotype = new HashMap<>(); +// for ( final Haplotype h : haplotypes ) { +// h.setScore(h.isReference() ? 
Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes +// allele2Haplotype.put(Allele.create(h, h.isReference()), h); +// } +// +// // for each sample, compute the most likely pair of haplotypes +// for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { +// // get the two most likely haplotypes under a diploid model for this sample +// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); +// +// if ( mla != null ) { // there was something to evaluate in this sample +// // note that there must be at least 2 haplotypes +// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); +// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); +// +//// if ( DEBUG ) { +//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +//// } +// +// // add these two haplotypes to the set of haplotypes that have been selected +// selectedHaplotypes.add(best); +// selectedHaplotypes.add(second); +// +// // we've already selected all of our haplotypes, and we don't need to prune them down +// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) +// break; // } - - // add these two haplotypes to the set of haplotypes that have been selected - selectedHaplotypes.add(best); - selectedHaplotypes.add(second); - - // we've already selected all of our haplotypes, and we don't need to prune them down - if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) - break; - } - } - - // take the best N haplotypes forward, in order of the number of samples that choose them - final int nSamples = stratifiedReadMap.size(); - final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); - - if ( 
DEBUG ) { - logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); - for ( final Haplotype h : bestHaplotypes ) { - logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); - } - } - return bestHaplotypes; - } - - /** - * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found - * @param haplotypes non-null list of haplotypes - * @return the reference haplotype - */ - private static Haplotype findReferenceHaplotype( final List haplotypes ) { - for( final Haplotype h : haplotypes ) { - if( h.isReference() ) return h; - } - throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); - } +// } +// +// // take the best N haplotypes forward, in order of the number of samples that choose them +// final int nSamples = stratifiedReadMap.size(); +// final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); +// +// if ( DEBUG ) { +// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); +// for ( final Haplotype h : bestHaplotypes ) { +// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); +// } +// } +// return bestHaplotypes; +// } +// +// /** +// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found +// * @param haplotypes non-null list of haplotypes +// * @return the reference haplotype +// */ +// private static Haplotype findReferenceHaplotype( final List haplotypes ) { +// for( final Haplotype h : haplotypes ) { +// if( h.isReference() ) return h; +// } +// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" 
); +// } // -------------------------------------------------------------------------------- // diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 3c6e409b9..97431368e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -48,11 +48,11 @@ package org.broadinstitute.sting.gatk.walkers.indels; import com.google.java.contract.Ensures; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; @@ -65,6 +65,7 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; import java.util.LinkedHashMap; +import java.util.LinkedList; import java.util.Map; //import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM; @@ -206,6 +207,39 @@ public class PairHMMIndelErrorModel { } } + private LinkedHashMap trimHaplotypes(final LinkedHashMap haplotypeMap, + long startLocationInRefForHaplotypes, + long stopLocationInRefForHaplotypes, + final ReferenceContext ref){ + + final LinkedHashMap trimmedHaplotypeMap = new LinkedHashMap<>(); + for (final Allele a: haplotypeMap.keySet()) { + + final Haplotype haplotype = haplotypeMap.get(a); + + if (stopLocationInRefForHaplotypes > 
haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) + startLocationInRefForHaplotypes = haplotype.getStopPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); + + if (DEBUG) + System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n", + indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); + + // get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map + final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); + final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference()); + trimmedHaplotypeMap.put(a, trimmedHaplotype); + } + return trimmedHaplotypeMap; + } + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, final LinkedHashMap haplotypeMap, @@ -231,6 +265,8 @@ public class PairHMMIndelErrorModel { final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; + final LinkedList readList = new LinkedList<>(); + final Map readGCPArrayMap = new LinkedHashMap<>(); int readIdx=0; for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations @@ -369,86 +405,30 @@ public class PairHMMIndelErrorModel { baseDeletionQualities = contextLogGapOpenProbabilities; } - byte[] currentHaplotypeBases = null; - boolean firstHap = true; - double readLikelihood; - Allele currentAllele = null; - 
for (Allele a: haplotypeMap.keySet()) { + // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); + readList.add(processedRead); - Haplotype haplotype = haplotypeMap.get(a); + // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM + readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); - if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) - stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations + final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); - if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) - startLocationInRefForHaplotypes = haplotype.getStartPosition(); - else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) - startLocationInRefForHaplotypes = haplotype.getStopPosition(); + // Get the likelihoods for our clipped read against each of our trimmed haplotypes.
+ final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); - final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); - final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); - - - if (DEBUG) - System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", - indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - - // peak at the next haplotype in the list - final byte[] nextHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - // process the current haplotype in the list - if (currentHaplotypeBases != null) { - // it's possible that the indel starts at the last base of the haplotypes - if ( currentHaplotypeBases.length == 0 ) { - readLikelihood = -Double.MAX_VALUE; - } else { - if (firstHap) { - //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(readBases.length, currentHaplotypeBases.length); - firstHap = false; - } - - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, nextHaplotypeBases); - } - - if (DEBUG) { - System.out.println("H:"+new String(currentHaplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - } - - perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood); - readLikelihoods[readIdx][j++] = readLikelihood; - } - // update the current haplotype - currentHaplotypeBases = nextHaplotypeBases; - currentAllele = a; - } - // process the final haplotype - if (currentHaplotypeBases != null) { - // it's possible that the indel starts at 
the last base of the haplotypes - if ( currentHaplotypeBases.length == 0 ) { - readLikelihood = -Double.MAX_VALUE; - } else { - if (firstHap) { - //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(readBases.length, currentHaplotypeBases.length); - firstHap = false; - } - // there is no next haplotype, so pass null for nextHaplotypeBases. - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, null); - } - - if (DEBUG) { - System.out.println("H:"+new String(currentHaplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - } - - perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood); + // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array + for (Allele a: trimmedHaplotypeMap.keySet()){ + double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); + perReadAlleleLikelihoodMap.add(p, a, readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } + // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually + readList.remove(processedRead); + + // The same is true for the read/GCP-array map + readGCPArrayMap.remove(processedRead); } } readIdx++; @@ -472,16 +452,16 @@ public class PairHMMIndelErrorModel { return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)); } - private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { - if (b1.length != b2.length) - return 0; // sanity check - - for (int i=0; i < b1.length; i++ ){ - if ( b1[i]!= b2[i] ) - return 
i; - } - return b1.length; - } +// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { +// if (b1.length != b2.length) +// return 0; // sanity check +// +// for (int i=0; i < b1.length; i++ ){ +// if ( b1[i]!= b2[i] ) +// return i; +// } +// return b1.length; +// } private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java index 4b996e770..a693ec22d 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java @@ -49,6 +49,7 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.QualityUtils; + import java.util.Arrays; /** diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java index d92b918ba..b80036bb2 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java @@ -47,12 +47,16 @@ package org.broadinstitute.sting.utils.pairhmm; import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; import java.io.File; import java.lang.reflect.Field; import java.util.Arrays; import java.util.LinkedList; import java.util.List; +import java.util.Map; public final class CnyPairHMM extends 
PairHMM implements BatchPairHMM { private static class HmmInput { @@ -62,14 +66,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { public byte[] deletionGOP; public byte[] overallGCP; public List haplotypes; - }; + } public static class ResultQueue { private int offset; private List batchResults; public ResultQueue() { - batchResults = new LinkedList(); + batchResults = new LinkedList<>(); offset = 0; } @@ -92,7 +96,7 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { final static String libName = "gmvhdl_gatk_hmm"; private static boolean loaded = false; - private List batchRequests = new LinkedList(); + private List batchRequests = new LinkedList<>(); private ResultQueue resultQueue = new ResultQueue(); static public boolean isAvailable() { @@ -184,6 +188,55 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { return results; } + /** + * {@inheritDoc} + */ + @Override + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap){ + + // initialize the pairHMM if necessary + if (! 
initialized) { + int readMaxLength = findMaxReadLength(reads); + int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + initialize(readMaxLength, haplotypeMaxLength); + } + + // Pass the read bases/quals, and the haplotypes as a list into the HMM + performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap); + + // Get the log10-likelihoods for each read/haplotype and pack into the results map + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap); + + return likelihoodMap; + } + + private void collectLikelihoodResults(List reads, Map alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) { + for(final GATKSAMRecord read : reads){ + final double[] likelihoods = batchGetResult(); + int jjj = 0; + for (Allele allele : alleleHaplotypeMap.keySet()){ + final double log10l = likelihoods[jjj]; + likelihoodMap.add(read, allele, log10l); + jjj++; + } + } + } + + private void performBatchAdditions(List reads, Map alleleHaplotypeMap, Map GCPArrayMap) { + final List haplotypeList = getHaplotypeList(alleleHaplotypeMap); + for(final GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + batchAdd(haplotypeList, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); + } + } + + protected double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
haplotypeList.add(alleleHaplotypeMap.get(a)); + } + return haplotypeList; + } + private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) { double[] results = null; int n = dequeueRequirement(haplotypeBases.length, readBases.length); diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index e745ca1f5..de2994927 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -70,17 +70,6 @@ public final class LoglessPairHMM extends N2MemoryPairHMM { private static final int deletionToDeletion = 5; - /** - * {@inheritDoc} - */ - @Override - public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - } - /** * {@inheritDoc} */ diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 70be85f54..af22eb7f9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -110,7 +110,7 @@ public class PerReadAlleleLikelihoodMap { * @return a map from each allele to a list of reads that 'support' the allele */ protected Map> getAlleleStratifiedReadMap() { - final Map> alleleReadMap = new HashMap>(alleles.size()); + final Map> alleleReadMap = new HashMap<>(alleles.size()); for ( final Allele allele : alleles ) alleleReadMap.put(allele, new ArrayList()); @@ -152,7 +152,7 @@ public class PerReadAlleleLikelihoodMap { /** * Does the current map contain the key associated 
with a particular SAM record in pileup? * @param p Pileup element - * @return + * @return true if the map contains pileup element, else false */ public boolean containsPileupElement(final PileupElement p) { return likelihoodReadMap.containsKey(p.getRead()); @@ -176,9 +176,9 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.keySet(); } - public Collection> getLikelihoodMapValues() { - return likelihoodReadMap.values(); - } +// public Collection> getLikelihoodMapValues() { +// return likelihoodReadMap.values(); +// } public int getNumberOfStoredElements() { return likelihoodReadMap.size(); @@ -191,6 +191,21 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.get(p.getRead()); } + + /** + * Get the log10 likelihood associated with an individual read/allele + * + * @param read the read whose likelihood we want + * @param allele the allele whose likelihood we want + * @return the log10 likelihood that this read matches this allele + */ + public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){ + if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read)) + return 0.0; + + return likelihoodReadMap.get(read).get(allele); + } + /** * Get the most likely alleles estimated across all reads in this object * diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java index e7bc5cb56..83b87da95 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -85,9 +85,6 @@ public final class Log10PairHMM extends N2MemoryPairHMM { Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); } - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** diff 
--git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java index a091a0716..c02461c03 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java @@ -26,10 +26,6 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.Arrays; /** * Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix @@ -58,6 +54,9 @@ abstract class N2MemoryPairHMM extends PairHMM { matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + transition = new double[paddedMaxReadLength][6]; + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index eb52f4a85..ce3d43a06 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -28,9 +28,14 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; - +import java.util.List; +import java.util.Map; /** * Util class for 
performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * @@ -59,7 +64,7 @@ public abstract class PairHMM { protected int maxHaplotypeLength, maxReadLength; protected int paddedMaxReadLength, paddedMaxHaplotypeLength; protected int paddedReadLength, paddedHaplotypeLength; - private boolean initialized = false; + protected boolean initialized = false; // only used for debugging purposes protected boolean doNotUseTristateCorrection = false; @@ -73,7 +78,7 @@ public abstract class PairHMM { * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM * @param readMaxLength the max length of reads we want to use with this PairHMM */ - public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + protected void initialize( final int readMaxLength, final int haplotypeMaxLength ) { if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); @@ -90,6 +95,79 @@ public abstract class PairHMM { initialized = true; } + protected int findMaxReadLength(final List reads) { + int listMaxReadLength = 0; + for(GATKSAMRecord read : reads){ + final int readLength = read.getReadLength(); + if( readLength > listMaxReadLength ) { listMaxReadLength = readLength; } + } + return listMaxReadLength; + } + + protected int findMaxHaplotypeLength(final Map haplotypeMap) { + int listMaxHaplotypeLength = 0; + for( final Allele a: haplotypeMap.keySet() ) { + final Haplotype h = haplotypeMap.get(a); + final int haplotypeLength = h.getBases().length; + if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } + } + return listMaxHaplotypeLength; + } + + /** + * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from + * each haplotype given base 
substitution, insertion, and deletion probabilities. + * + * @param reads the list of reads + * @param alleleHaplotypeMap the map from alleles to haplotypes + * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. + * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of + * said read coming from the said haplotype under the provided error model + */ + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { + + // (re)initialize the pairHMM only if necessary + final int readMaxLength = findMaxReadLength(reads); + final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } + + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + for(GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + // peek at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) + byte[] currentHaplotypeBases = null; + boolean isFirstHaplotype = true; + Allele currentAllele = null; + double log10l; + for (final Allele allele : alleleHaplotypeMap.keySet()){ + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + final byte[] nextHaplotypeBases = haplotype.getBases(); + if (currentHaplotypeBases != null) { + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP,
isFirstHaplotype, nextHaplotypeBases); + likelihoodMap.add(read, currentAllele, log10l); + } + // update the current haplotype + currentHaplotypeBases = nextHaplotypeBases; + currentAllele = allele; + } + // process the final haplotype + if (currentHaplotypeBases != null) { + + // there is no next haplotype, so pass null for nextHaplotypeBases. + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); + likelihoodMap.add(read, currentAllele, log10l); + } + } + return likelihoodMap; + } + /** * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion * probabilities. @@ -110,7 +188,7 @@ public abstract class PairHMM { * parameters are the same, and only the haplotype bases are changing underneath us * @return the log10 probability of read coming from the haplotype under the provided error model */ - public final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final byte[] insertionGOP, @@ -118,6 +196,7 @@ public abstract class PairHMM { final byte[] overallGCP, final boolean recacheReadValues, final byte[] nextHaploytpeBases) { + if ( ! 
initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c39245730..93718b04d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -139,7 +139,7 @@ public class GATKSAMRecord extends BAMRecord { } public static GATKSAMRecord createRandomRead(int length) { - List cigarElements = new LinkedList(); + List cigarElements = new LinkedList<>(); cigarElements.add(new CigarElement(length, CigarOperator.M)); Cigar cigar = new Cigar(cigarElements); return ArtificialSAMUtils.createArtificialRead(cigar); @@ -536,10 +536,7 @@ public class GATKSAMRecord extends BAMRecord { * @return True if an attribute has been set for this key. 
*/ public boolean containsTemporaryAttribute(Object key) { - if(temporaryAttributes != null) { - return temporaryAttributes.containsKey(key); - } - return false; + return temporaryAttributes != null && temporaryAttributes.containsKey(key); } /** @@ -556,7 +553,7 @@ public class GATKSAMRecord extends BAMRecord { */ public Object setTemporaryAttribute(Object key, Object value) { if(temporaryAttributes == null) { - temporaryAttributes = new HashMap(); + temporaryAttributes = new HashMap<>(); } return temporaryAttributes.put(key, value); } @@ -750,6 +747,46 @@ public class GATKSAMRecord extends BAMRecord { return emptyRead; } + /** + * Creates a new GATKSAMRecord with the source read's header, read group and mate + * information, but with the following fields set to user-supplied values: + * - Read Bases + * - Base Qualities + * - Base Insertion Qualities + * - Base Deletion Qualities + * + * Cigar string is empty (not-null) + * + * Use this method if you want to create a new GATKSAMRecord based on + * another GATKSAMRecord, but with modified bases and qualities + * + * @param read a read to copy the header from + * @param readBases an array containing the new bases you wish to use in place of the originals + * @param baseQualities an array containing the new base qualities you wish to use in place of the originals + * @param baseInsertionQualities an array containing the new base insertion qualities + * @param baseDeletionQualities an array containing the new base deletion qualities + * @return a read with modified bases and qualities, safe for the GATK + */ + public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read, + final byte[] readBases, + final byte[] baseQualities, + final byte[] baseInsertionQualities, + final byte[] baseDeletionQualities) { + if ( baseQualities.length != readBases.length || baseInsertionQualities.length != readBases.length || baseDeletionQualities.length != readBases.length ) + throw new IllegalArgumentException("Read bases
and read quality arrays aren't the same size: Bases:" + readBases.length + + " vs Base Q's:" + baseQualities.length + + " vs Insert Q's:" + baseInsertionQualities.length + + " vs Delete Q's:" + baseDeletionQualities.length); + + final GATKSAMRecord processedRead = GATKSAMRecord.emptyRead(read); + processedRead.setReadBases(readBases); + processedRead.setBaseQualities(baseQualities, EventType.BASE_SUBSTITUTION); + processedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION); + processedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION); + + return processedRead; + } + /** * Shallow copy of everything, except for the attribute list and the temporary attributes. * A new list of the attributes is created for both, but the attributes themselves are copied by reference. @@ -762,7 +799,7 @@ public class GATKSAMRecord extends BAMRecord { public Object clone() throws CloneNotSupportedException { final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); if (temporaryAttributes != null) { - clone.temporaryAttributes = new HashMap(); + clone.temporaryAttributes = new HashMap<>(); for (Object attribute : temporaryAttributes.keySet()) clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); }