Improve the PairHMM API for better FPGA integration

Motivation:
The API was different between the regular PairHMM and the FPGA-implementation
via CnyPairHMM. As a result, the LikelihoodCalculationEngine had
to use account for this. The goal is to change the API to be the same
for all implementations, and make it easier to access.

PairHMM
PairHMM now accepts a list of reads and a map of alleles/haplotpes and returns a PerReadAlleleLikelihoodMap.
Added a new primary method that loops the reads and haplotypes, extracts qualities,
and passes them to the computeReadLikelihoodGivenHaplotypeLog10 method.
Did not alter that method, or its subcompute method, at all.
PairHMM also now handles its own (re)initialization, so users don't have to worry about that.

CnyPairHMM
Added that same new primary access method to this FPGA class.
Method overrides the default implementation in PairHMM. Walks through a list of reads.
Individual-read quals and the full haplotype list are fed to batchAdd(), as before.
However, instead of waiting for every read to get added, and then walking through the reads
again to extract results, we just get the haplotype-results array for each read as soon as it
is generated, and pack it into a perReadAlleleLikelihoodMap for return.
The main access method is now the same no matter whether the FPGA CnyPairHMM is used or not.

LikelihoodCalculationEngine
The functionality to loop through the reads and haplotypes and get individual log10-likelihoods
was moved to the PairHMM, and so removed from here. However, this class does need to retain
the ability to pre-process the reads, and post-process the resulting likelihoods map.
Those features were separated from running the HMM and refactored into their own methods
Commented out the (unused) system for finding best N haplotypes for genotyping.

PairHMMIndelErrorModel
Similar changes were made as to the LCE. However, in this case the haplotypes are modified
based on each individual read, so the read-list we feed into the HMM only has one read.
This commit is contained in:
bradtaylor 2013-10-21 14:08:03 -04:00
parent 96024403bf
commit e40a07bb58
10 changed files with 547 additions and 401 deletions

View File

@ -52,18 +52,10 @@ import net.sf.samtools.SAMUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator; import org.broadinstitute.sting.utils.pairhmm.*;
import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM;
import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -88,7 +80,7 @@ public class LikelihoodCalculationEngine {
private final PairHMM.HMM_IMPLEMENTATION hmmType; private final PairHMM.HMM_IMPLEMENTATION hmmType;
private final boolean noFpga; private final boolean noFpga;
private final ThreadLocal<PairHMM> pairHMM = new ThreadLocal<PairHMM>() { private final ThreadLocal<PairHMM> pairHMMThreadLocal = new ThreadLocal<PairHMM>() {
@Override @Override
protected PairHMM initialValue() { protected PairHMM initialValue() {
switch (hmmType) { switch (hmmType) {
@ -109,6 +101,8 @@ public class LikelihoodCalculationEngine {
} }
} }
}; };
// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException
// private final PairHMM pairHMM = pairHMMThreadLocal.get();
private final static boolean WRITE_LIKELIHOODS_TO_FILE = false; private final static boolean WRITE_LIKELIHOODS_TO_FILE = false;
private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
@ -173,36 +167,145 @@ public class LikelihoodCalculationEngine {
if ( likelihoodsStream != null ) likelihoodsStream.close(); if ( likelihoodsStream != null ) likelihoodsStream.close();
} }
/** private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate if ( WRITE_LIKELIHOODS_TO_FILE ) {
* likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
* After calling this routine the PairHMM will be configured to best evaluate all reads in the samples haplotype.getBaseString(),
* against the set of haplotypes new String(processedRead.getReadBases() ),
* SAMUtils.phredToFastq(processedRead.getBaseQualities() ),
* @param haplotypes a non-null list of haplotypes SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ),
* @param perSampleReadList a mapping from sample -> reads SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ),
*/ SAMUtils.phredToFastq(constantGCP),
private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList) { log10l);
int X_METRIC_LENGTH = 0;
for( final Map.Entry<String, List<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
for( final GATKSAMRecord read : sample.getValue() ) {
final int readLength = read.getReadLength();
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
}
} }
int Y_METRIC_LENGTH = 0;
for( final Haplotype h : haplotypes ) {
final int haplotypeLength = h.getBases().length;
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
}
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
} }
private Map<Allele, Haplotype> createAlleleMap(List<Haplotype> haplotypes){
final int numHaplotypes = haplotypes.size();
final Map<Allele, Haplotype> alleleMap = new LinkedHashMap<>(numHaplotypes);
for ( final Haplotype haplotype : haplotypes ) {
final Allele allele = Allele.create(haplotype, true);
alleleMap.put(allele, haplotype);
}
return alleleMap;
}
private Map<GATKSAMRecord, byte[]> fillGCPArrays(List<GATKSAMRecord> reads){
final Map<GATKSAMRecord, byte []> GCPArrayMap = new LinkedHashMap<>();
for (GATKSAMRecord read: reads){
byte [] GCPArray = new byte[read.getReadBases().length];
Arrays.fill( GCPArray, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
GCPArrayMap.put(read, GCPArray);
}
return GCPArrayMap;
}
private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) {
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
}
}
/**
* Pre-processing of the reads to be evaluated at the current location from the current sample.
* We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read.
* Modified copies of reads are packed into a new list, while original reads are retained for downstream use
*
* @param reads The original list of unmodified reads
* @return processedReads. A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding
*/
private List<GATKSAMRecord> modifyReadQualities(final List<GATKSAMRecord> reads) {
List<GATKSAMRecord> processedReads = new LinkedList<>();
for ( GATKSAMRecord read : reads ) {
final byte[] readBases = read.getReadBases();
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
capMinimumReadQualities(read, readQuals, readInsQuals, readDelQuals);
// Create a new copy of the read and sets its base qualities to the modified versions.
// Pack this into a new list for return
final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals);
processedReads.add(processedRead);
}
return processedReads;
}
/**
* Post-processing of the read/allele likelihoods.
*
* We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads.
* We wish to return a map containing the original, unmodified reads.
*
* At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate.
* This protects us from the case where the assembly has produced haplotypes
* that are very divergent from reference, but are supported by only one read. In effect
* we capping how badly scoring the reference can be for any read by the chance that the read
* itself just doesn't belong here
*
* @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. Contains the processed reads, the haplotype Alleles, and their log10ls
* @param reads Our original, unmodified reads
* @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls
* @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype
*
* @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls
*/
private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List<GATKSAMRecord> reads, List<GATKSAMRecord> processedReads, Map<Allele, Haplotype> alleleHaplotypeMap){
// a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls
final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
Allele refAllele = null;
final int numReads = reads.size();
for (int readIndex = 0; readIndex < numReads; readIndex++) {
// Get the original and quality-modified read from their respective lists
// Note that this requires both lists to have reads in the same order
final GATKSAMRecord originalRead = reads.get(readIndex);
final GATKSAMRecord processedRead = processedReads.get(readIndex);
// keep track of the reference likelihood and the best non-ref likelihood
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
for ( Allele allele : alleleHaplotypeMap.keySet() ) {
final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele);
final Haplotype haplotype = alleleHaplotypeMap.get(allele);
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else {
refAllele = allele;
refLog10l = log10l;
}
writeDebugLikelihoods(processedRead, haplotype, log10l);
// add the ORIGINAL (non-capped) read to the final map, along with the current haplotype and associated log10l
processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l);
}
// ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
// that are very divergent from reference, but are supported by only one read. In effect
// we capping how badly scoring the reference can be for any read by the chance that the read
// itself just doesn't belong here
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed);
}
}
return processedReadAlleleLikelihoodMap;
}
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) { public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
// configure the HMM
initializePairHMM(haplotypes, perSampleReadList);
// Add likelihoods for each sample's reads to our stratifiedReadMap // Add likelihoods for each sample's reads to our stratifiedReadMap
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>(); final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>();
@ -218,137 +321,22 @@ public class LikelihoodCalculationEngine {
} }
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List<Haplotype> haplotypes, final List<GATKSAMRecord> reads) { private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List<Haplotype> haplotypes, final List<GATKSAMRecord> reads) {
// first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time)
final BatchPairHMM batchPairHMM = (pairHMM.get() instanceof BatchPairHMM) ? (BatchPairHMM)pairHMM.get() : null;
final Vector<GATKSAMRecord> batchedReads = new Vector<GATKSAMRecord>(reads.size());
final int numHaplotypes = haplotypes.size();
final Map<Haplotype, Allele> alleleVersions = new LinkedHashMap<>(numHaplotypes);
Allele refAllele = null;
for ( final Haplotype haplotype : haplotypes ) {
final Allele allele = Allele.create(haplotype, true);
alleleVersions.put(haplotype, allele);
if ( haplotype.isReference() ) refAllele = allele;
}
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities
for( final GATKSAMRecord read : reads ) { List<GATKSAMRecord> processedReads = modifyReadQualities(reads);
final byte[] readBases = read.getReadBases(); // Get alleles corresponding to our haplotypees
final byte[] overallGCP = new byte[read.getReadLength()]; Map<Allele, Haplotype> alleleHaplotypeMap = createAlleleMap(haplotypes);
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read // Get an array containing the constantGCP for each read in our modified read list
final byte[] readQuals = read.getBaseQualities().clone(); Map<GATKSAMRecord,byte[]> GCPArrayMap = fillGCPArrays(processedReads);
final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
applyPCRErrorModel(readBases, readInsQuals, readDelQuals); // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype
PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap);
for( int kkk = 0; kkk < readQuals.length; kkk++ ) { // Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
}
if ( batchPairHMM != null ) { return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap);
batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
batchedReads.add(read);
continue;
}
// keep track of the reference likelihood and the best non-ref likelihood
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
// iterate over all haplotypes, calculating the likelihood of the read for each haplotype
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final byte[] nextHaplotypeBases = (jjj == numHaplotypes - 1) ? null : haplotypes.get(jjj+1).getBases();
final boolean isFirstHaplotype = jjj == 0;
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(readBases),
SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals),
SAMUtils.phredToFastq(overallGCP),
log10l);
}
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else
refLog10l = log10l;
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
// ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
// that are very divergent from reference, but are supported by only one read. In effect
// we capping how badly scoring the reference can be for any read by the chance that the read
// itself just doesn't belong here
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
}
}
if ( batchPairHMM != null ) {
for( final GATKSAMRecord read : batchedReads ) {
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
final double[] likelihoods = batchPairHMM.batchGetResult();
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final double log10l = likelihoods[jjj];
if ( WRITE_LIKELIHOODS_TO_FILE ) {
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities();
final byte[] readDelQuals = read.getBaseDeletionQualities();
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
// TODO -- why is Q18 hard-coded here???
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
}
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(read.getReadBases()),
SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals),
SAMUtils.phredToFastq(overallGCP),
log10l);
}
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else
refLog10l = log10l;
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
}
}
}
return perReadAlleleLikelihoodMap;
} }
@Requires({"alleleOrdering.size() > 0"}) @Requires({"alleleOrdering.size() > 0"})
@ -421,125 +409,125 @@ public class LikelihoodCalculationEngine {
// System to compute the best N haplotypes for genotyping // System to compute the best N haplotypes for genotyping
// //
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
//
/** // /**
* Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele // * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
* @param map an annoying map object that moves us between the allele and haplotype representation // * @param map an annoying map object that moves us between the allele and haplotype representation
* @param haplotypeAsAllele the allele version of the haplotype // * @param haplotypeAsAllele the allele version of the haplotype
* @return the haplotype version, with its score incremented by 1 if its non-reference // * @return the haplotype version, with its score incremented by 1 if its non-reference
*/ // */
private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) { // private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic // final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value // if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
return h; // return h;
} // }
//
/** // /**
* Take the best N haplotypes and return them as a list // * Take the best N haplotypes and return them as a list
* // *
* Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample // * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
* as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing // * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
* order of score (so higher score haplotypes are preferred). The N we take is determined by // * order of score (so higher score haplotypes are preferred). The N we take is determined by
* // *
* N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) // * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
* // *
* where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is // * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
* bounded by maxNumHaplotypesInPopulation as that number can grow without bound // * bounded by maxNumHaplotypesInPopulation as that number can grow without bound
* // *
* @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 // * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
* @param nSamples the number of samples used to select the haplotypes // * @param nSamples the number of samples used to select the haplotypes
* @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples // * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
* @return a list of N or fewer haplotypes, with the reference haplotype first // * @return a list of N or fewer haplotypes, with the reference haplotype first
*/ // */
private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { // private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
final List<Haplotype> selectedHaplotypesList = new ArrayList<Haplotype>(selectedHaplotypes); // final List<Haplotype> selectedHaplotypesList = new ArrayList<>(selectedHaplotypes);
Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); // Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; // final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); // final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); // final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); // if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
return bestHaplotypes; // return bestHaplotypes;
} // }
//
/** // /**
* Select the best haplotypes for genotyping the samples in stratifiedReadMap // * Select the best haplotypes for genotyping the samples in stratifiedReadMap
* // *
* Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely // * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
* haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for // * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
* all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get // * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
* one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation // * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
* the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the // * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
* haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. // * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
* // *
* @param haplotypes a list of all haplotypes we're considering // * @param haplotypes a list of all haplotypes we're considering
* @param stratifiedReadMap a map from sample -> read likelihoods per haplotype // * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
* @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes // * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
* @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation // * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
*/ // */
public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) { // public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); // if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
//
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes // if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
//
// all of the haplotypes that at least one sample called as one of the most likely // // all of the haplotypes that at least one sample called as one of the most likely
final Set<Haplotype> selectedHaplotypes = new HashSet<>(); // final Set<Haplotype> selectedHaplotypes = new HashSet<>();
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected // selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
//
// our annoying map from allele -> haplotype // // our annoying map from allele -> haplotype
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>(); // final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
for ( final Haplotype h : haplotypes ) { // for ( final Haplotype h : haplotypes ) {
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes // h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
allele2Haplotype.put(Allele.create(h, h.isReference()), h); // allele2Haplotype.put(Allele.create(h, h.isReference()), h);
} // }
//
// for each sample, compute the most likely pair of haplotypes // // for each sample, compute the most likely pair of haplotypes
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) { // for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
// get the two most likely haplotypes under a diploid model for this sample // // get the two most likely haplotypes under a diploid model for this sample
final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); // final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
//
if ( mla != null ) { // there was something to evaluate in this sample // if ( mla != null ) { // there was something to evaluate in this sample
// note that there must be at least 2 haplotypes // // note that there must be at least 2 haplotypes
final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); // final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); // final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
//
// if ( DEBUG ) { //// if ( DEBUG ) {
// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); //// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
//// }
//
// // add these two haplotypes to the set of haplotypes that have been selected
// selectedHaplotypes.add(best);
// selectedHaplotypes.add(second);
//
// // we've already selected all of our haplotypes, and we don't need to prune them down
// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
// break;
// } // }
// }
// add these two haplotypes to the set of haplotypes that have been selected //
selectedHaplotypes.add(best); // // take the best N haplotypes forward, in order of the number of samples that choose them
selectedHaplotypes.add(second); // final int nSamples = stratifiedReadMap.size();
// final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
// we've already selected all of our haplotypes, and we don't need to prune them down //
if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) // if ( DEBUG ) {
break; // logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
} // for ( final Haplotype h : bestHaplotypes ) {
} // logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
// }
// take the best N haplotypes forward, in order of the number of samples that choose them // }
final int nSamples = stratifiedReadMap.size(); // return bestHaplotypes;
final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); // }
//
if ( DEBUG ) { // /**
logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); // * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
for ( final Haplotype h : bestHaplotypes ) { // * @param haplotypes non-null list of haplotypes
logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); // * @return the reference haplotype
} // */
} // private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
return bestHaplotypes; // for( final Haplotype h : haplotypes ) {
} // if( h.isReference() ) return h;
// }
/** // throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
* Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found // }
* @param haplotypes non-null list of haplotypes
* @return the reference haplotype
*/
private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
for( final Haplotype h : haplotypes ) {
if( h.isReference() ) return h;
}
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
// //

View File

@ -48,11 +48,11 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
@ -65,6 +65,7 @@ import org.broadinstitute.variant.variantcontext.Allele;
import java.util.Arrays; import java.util.Arrays;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map; import java.util.Map;
//import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM; //import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM;
@ -206,6 +207,39 @@ public class PairHMMIndelErrorModel {
} }
} }
private LinkedHashMap<Allele, Haplotype> trimHaplotypes(final LinkedHashMap<Allele, Haplotype> haplotypeMap,
long startLocationInRefForHaplotypes,
long stopLocationInRefForHaplotypes,
final ReferenceContext ref){
final LinkedHashMap<Allele, Haplotype> trimmedHaplotypeMap = new LinkedHashMap<>();
for (final Allele a: haplotypeMap.keySet()) {
final Haplotype haplotype = haplotypeMap.get(a);
if (stopLocationInRefForHaplotypes > haplotype.getStopPosition())
stopLocationInRefForHaplotypes = haplotype.getStopPosition();
if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
startLocationInRefForHaplotypes = haplotype.getStartPosition();
else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
startLocationInRefForHaplotypes = haplotype.getStopPosition();
final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
if (DEBUG)
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n",
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
// get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map
final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference());
trimmedHaplotypeMap.put(a, trimmedHaplotype);
}
return trimmedHaplotypeMap;
}
public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
final LinkedHashMap<Allele, Haplotype> haplotypeMap, final LinkedHashMap<Allele, Haplotype> haplotypeMap,
@ -231,6 +265,8 @@ public class PairHMMIndelErrorModel {
final int[] readCounts) { final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final LinkedList<GATKSAMRecord> readList = new LinkedList<>();
final Map<GATKSAMRecord, byte[]> readGCPArrayMap = new LinkedHashMap<>();
int readIdx=0; int readIdx=0;
for (PileupElement p: pileup) { for (PileupElement p: pileup) {
// > 1 when the read is a consensus read representing multiple independent observations // > 1 when the read is a consensus read representing multiple independent observations
@ -369,86 +405,30 @@ public class PairHMMIndelErrorModel {
baseDeletionQualities = contextLogGapOpenProbabilities; baseDeletionQualities = contextLogGapOpenProbabilities;
} }
byte[] currentHaplotypeBases = null; // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM
boolean firstHap = true; final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities);
double readLikelihood; readList.add(processedRead);
Allele currentAllele = null;
for (Allele a: haplotypeMap.keySet()) {
Haplotype haplotype = haplotypeMap.get(a); // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM
readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities);
if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the approprate genomic locations
stopLocationInRefForHaplotypes = haplotype.getStopPosition(); final Map<Allele, Haplotype> trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref);
if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) // Get the likelihoods for our clipped read against each of our trimmed haplotypes.
startLocationInRefForHaplotypes = haplotype.getStartPosition(); final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap);
else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
startLocationInRefForHaplotypes = haplotype.getStopPosition();
final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array
final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); for (Allele a: trimmedHaplotypeMap.keySet()){
double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a);
perReadAlleleLikelihoodMap.add(p, a, readLikelihood);
if (DEBUG)
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString());
// peak at the next haplotype in the list
final byte[] nextHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
// process the current haplotype in the list
if (currentHaplotypeBases != null) {
// it's possible that the indel starts at the last base of the haplotypes
if ( currentHaplotypeBases.length == 0 ) {
readLikelihood = -Double.MAX_VALUE;
} else {
if (firstHap) {
//no need to reallocate arrays for each new haplotype, as length won't change
pairHMM.initialize(readBases.length, currentHaplotypeBases.length);
firstHap = false;
}
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals,
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, nextHaplotypeBases);
}
if (DEBUG) {
System.out.println("H:"+new String(currentHaplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
}
perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
}
// update the current haplotype
currentHaplotypeBases = nextHaplotypeBases;
currentAllele = a;
}
// process the final haplotype
if (currentHaplotypeBases != null) {
// it's possible that the indel starts at the last base of the haplotypes
if ( currentHaplotypeBases.length == 0 ) {
readLikelihood = -Double.MAX_VALUE;
} else {
if (firstHap) {
//no need to reallocate arrays for each new haplotype, as length won't change
pairHMM.initialize(readBases.length, currentHaplotypeBases.length);
firstHap = false;
}
// there is no next haplotype, so pass null for nextHaplotypeBases.
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals,
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, null);
}
if (DEBUG) {
System.out.println("H:"+new String(currentHaplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
}
perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood; readLikelihoods[readIdx][j++] = readLikelihood;
} }
// The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually
readList.remove(processedRead);
// The same is true for the read/GCP-array map
readGCPArrayMap.remove(processedRead);
} }
} }
readIdx++; readIdx++;
@ -472,16 +452,16 @@ public class PairHMMIndelErrorModel {
return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)); return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength));
} }
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { // private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
if (b1.length != b2.length) // if (b1.length != b2.length)
return 0; // sanity check // return 0; // sanity check
//
for (int i=0; i < b1.length; i++ ){ // for (int i=0; i < b1.length; i++ ){
if ( b1[i]!= b2[i] ) // if ( b1[i]!= b2[i] )
return i; // return i;
} // }
return b1.length; // return b1.length;
} // }
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];

View File

@ -49,6 +49,7 @@ package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays; import java.util.Arrays;
/** /**

View File

@ -47,12 +47,16 @@
package org.broadinstitute.sting.utils.pairhmm; package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import java.io.File; import java.io.File;
import java.lang.reflect.Field; import java.lang.reflect.Field;
import java.util.Arrays; import java.util.Arrays;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map;
public final class CnyPairHMM extends PairHMM implements BatchPairHMM { public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
private static class HmmInput { private static class HmmInput {
@ -62,14 +66,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
public byte[] deletionGOP; public byte[] deletionGOP;
public byte[] overallGCP; public byte[] overallGCP;
public List<Haplotype> haplotypes; public List<Haplotype> haplotypes;
}; }
public static class ResultQueue { public static class ResultQueue {
private int offset; private int offset;
private List<double[]> batchResults; private List<double[]> batchResults;
public ResultQueue() { public ResultQueue() {
batchResults = new LinkedList<double[]>(); batchResults = new LinkedList<>();
offset = 0; offset = 0;
} }
@ -92,7 +96,7 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
final static String libName = "gmvhdl_gatk_hmm"; final static String libName = "gmvhdl_gatk_hmm";
private static boolean loaded = false; private static boolean loaded = false;
private List<HmmInput> batchRequests = new LinkedList<HmmInput>(); private List<HmmInput> batchRequests = new LinkedList<>();
private ResultQueue resultQueue = new ResultQueue(); private ResultQueue resultQueue = new ResultQueue();
static public boolean isAvailable() { static public boolean isAvailable() {
@ -184,6 +188,55 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
return results; return results;
} }
/**
* {@inheritDoc}
*/
@Override
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap){
// initialize the pairHMM if necessary
if (! initialized) {
int readMaxLength = findMaxReadLength(reads);
int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
initialize(readMaxLength, haplotypeMaxLength);
}
// Pass the read bases/quals, and the haplotypes as a list into the HMM
performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap);
// Get the log10-likelihoods for each read/haplotype ant pack into the results map
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap);
return likelihoodMap;
}
private void collectLikelihoodResults(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) {
for(final GATKSAMRecord read : reads){
final double[] likelihoods = batchGetResult();
int jjj = 0;
for (Allele allele : alleleHaplotypeMap.keySet()){
final double log10l = likelihoods[jjj];
likelihoodMap.add(read, allele, log10l);
jjj++;
}
}
}
private void performBatchAdditions(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, Map<GATKSAMRecord, byte[]> GCPArrayMap) {
final List<Haplotype> haplotypeList = getHaplotypeList(alleleHaplotypeMap);
for(final GATKSAMRecord read : reads){
final byte[] readBases = read.getReadBases();
final byte[] readQuals = read.getBaseQualities();
final byte[] readInsQuals = read.getBaseInsertionQualities();
final byte[] readDelQuals = read.getBaseDeletionQualities();
final byte[] overallGCP = GCPArrayMap.get(read);
batchAdd(haplotypeList, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
}
}
protected double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, protected double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases, final byte[] readBases,
final byte[] readQuals, final byte[] readQuals,
@ -196,6 +249,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
return 0.0; return 0.0;
} }
private List<Haplotype> getHaplotypeList(Map<Allele, Haplotype> alleleHaplotypeMap){
final List<Haplotype> haplotypeList = new LinkedList<>();
for (Allele a : alleleHaplotypeMap.keySet()){
haplotypeList.add(alleleHaplotypeMap.get(a));
}
return haplotypeList;
}
private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) { private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) {
double[] results = null; double[] results = null;
int n = dequeueRequirement(haplotypeBases.length, readBases.length); int n = dequeueRequirement(haplotypeBases.length, readBases.length);

View File

@ -70,17 +70,6 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
private static final int deletionToDeletion = 5; private static final int deletionToDeletion = 5;
/**
* {@inheritDoc}
*/
@Override
public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(readMaxLength, haplotypeMaxLength);
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/** /**
* {@inheritDoc} * {@inheritDoc}
*/ */

View File

@ -110,7 +110,7 @@ public class PerReadAlleleLikelihoodMap {
* @return a map from each allele to a list of reads that 'support' the allele * @return a map from each allele to a list of reads that 'support' the allele
*/ */
protected Map<Allele,List<GATKSAMRecord>> getAlleleStratifiedReadMap() { protected Map<Allele,List<GATKSAMRecord>> getAlleleStratifiedReadMap() {
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>(alleles.size()); final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<>(alleles.size());
for ( final Allele allele : alleles ) for ( final Allele allele : alleles )
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>()); alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
@ -152,7 +152,7 @@ public class PerReadAlleleLikelihoodMap {
/** /**
* Does the current map contain the key associated with a particular SAM record in pileup? * Does the current map contain the key associated with a particular SAM record in pileup?
* @param p Pileup element * @param p Pileup element
* @return * @return true if the map contains pileup element, else false
*/ */
public boolean containsPileupElement(final PileupElement p) { public boolean containsPileupElement(final PileupElement p) {
return likelihoodReadMap.containsKey(p.getRead()); return likelihoodReadMap.containsKey(p.getRead());
@ -176,9 +176,9 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.keySet(); return likelihoodReadMap.keySet();
} }
public Collection<Map<Allele,Double>> getLikelihoodMapValues() { // public Collection<Map<Allele,Double>> getLikelihoodMapValues() {
return likelihoodReadMap.values(); // return likelihoodReadMap.values();
} // }
public int getNumberOfStoredElements() { public int getNumberOfStoredElements() {
return likelihoodReadMap.size(); return likelihoodReadMap.size();
@ -191,6 +191,21 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.get(p.getRead()); return likelihoodReadMap.get(p.getRead());
} }
/**
* Get the log10 likelihood associated with an individual read/allele
*
* @param read the read whose likelihood we want
* @param allele the allele whose likelihood we want
* @return the log10 likelihood that this read matches this allele
*/
public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){
if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read))
return 0.0;
return likelihoodReadMap.get(read).get(allele);
}
/** /**
* Get the most likely alleles estimated across all reads in this object * Get the most likely alleles estimated across all reads in this object
* *

View File

@ -85,9 +85,6 @@ public final class Log10PairHMM extends N2MemoryPairHMM {
Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY);
} }
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
} }
/** /**

View File

@ -26,10 +26,6 @@
package org.broadinstitute.sting.utils.pairhmm; package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Arrays;
/** /**
* Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix * Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix
@ -58,6 +54,9 @@ abstract class N2MemoryPairHMM extends PairHMM {
matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
} }
/** /**

View File

@ -28,9 +28,14 @@ package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import java.util.Arrays; import java.util.Arrays;
import java.util.List;
import java.util.Map;
/** /**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
* *
@ -59,7 +64,7 @@ public abstract class PairHMM {
protected int maxHaplotypeLength, maxReadLength; protected int maxHaplotypeLength, maxReadLength;
protected int paddedMaxReadLength, paddedMaxHaplotypeLength; protected int paddedMaxReadLength, paddedMaxHaplotypeLength;
protected int paddedReadLength, paddedHaplotypeLength; protected int paddedReadLength, paddedHaplotypeLength;
private boolean initialized = false; protected boolean initialized = false;
// only used for debugging purposes // only used for debugging purposes
protected boolean doNotUseTristateCorrection = false; protected boolean doNotUseTristateCorrection = false;
@ -73,7 +78,7 @@ public abstract class PairHMM {
* @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
* @param readMaxLength the max length of reads we want to use with this PairHMM * @param readMaxLength the max length of reads we want to use with this PairHMM
*/ */
public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { protected void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength);
if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength);
@ -90,6 +95,79 @@ public abstract class PairHMM {
initialized = true; initialized = true;
} }
protected int findMaxReadLength(final List<GATKSAMRecord> reads) {
int listMaxReadLength = 0;
for(GATKSAMRecord read : reads){
final int readLength = read.getReadLength();
if( readLength > listMaxReadLength ) { listMaxReadLength = readLength; }
}
return listMaxReadLength;
}
protected int findMaxHaplotypeLength(final Map<Allele, Haplotype> haplotypeMap) {
int listMaxHaplotypeLength = 0;
for( final Allele a: haplotypeMap.keySet() ) {
final Haplotype h = haplotypeMap.get(a);
final int haplotypeLength = h.getBases().length;
if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; }
}
return listMaxHaplotypeLength;
}
/**
* Given a list of reads and haplotypes, for every read compute the total probability of said read arising from
* each haplotype given base substitution, insertion, and deletion probabilities.
*
* @param reads the list of reads
* @param alleleHaplotypeMap the list of haplotypes
* @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read.
* @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of
* said read coming from the said haplotype under the provided error model
*/
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap) {
// (re)initialize the pairHMM only if necessary
final int readMaxLength = findMaxReadLength(reads);
final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); }
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
for(GATKSAMRecord read : reads){
final byte[] readBases = read.getReadBases();
final byte[] readQuals = read.getBaseQualities();
final byte[] readInsQuals = read.getBaseInsertionQualities();
final byte[] readDelQuals = read.getBaseDeletionQualities();
final byte[] overallGCP = GCPArrayMap.get(read);
// peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation)
byte[] currentHaplotypeBases = null;
boolean isFirstHaplotype = true;
Allele currentAllele = null;
double log10l;
for (final Allele allele : alleleHaplotypeMap.keySet()){
final Haplotype haplotype = alleleHaplotypeMap.get(allele);
final byte[] nextHaplotypeBases = haplotype.getBases();
if (currentHaplotypeBases != null) {
log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
likelihoodMap.add(read, currentAllele, log10l);
}
// update the current haplotype
currentHaplotypeBases = nextHaplotypeBases;
currentAllele = allele;
}
// process the final haplotype
if (currentHaplotypeBases != null) {
// there is no next haplotype, so pass null for nextHaplotypeBases.
log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null);
likelihoodMap.add(read, currentAllele, log10l);
}
}
return likelihoodMap;
}
/** /**
* Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion
* probabilities. * probabilities.
@ -110,7 +188,7 @@ public abstract class PairHMM {
* parameters are the same, and only the haplotype bases are changing underneath us * parameters are the same, and only the haplotype bases are changing underneath us
* @return the log10 probability of read coming from the haplotype under the provided error model * @return the log10 probability of read coming from the haplotype under the provided error model
*/ */
public final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases, final byte[] readBases,
final byte[] readQuals, final byte[] readQuals,
final byte[] insertionGOP, final byte[] insertionGOP,
@ -118,6 +196,7 @@ public abstract class PairHMM {
final byte[] overallGCP, final byte[] overallGCP,
final boolean recacheReadValues, final boolean recacheReadValues,
final byte[] nextHaploytpeBases) { final byte[] nextHaploytpeBases) {
if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10");
if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null");
if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength);

View File

@ -139,7 +139,7 @@ public class GATKSAMRecord extends BAMRecord {
} }
public static GATKSAMRecord createRandomRead(int length) { public static GATKSAMRecord createRandomRead(int length) {
List<CigarElement> cigarElements = new LinkedList<CigarElement>(); List<CigarElement> cigarElements = new LinkedList<>();
cigarElements.add(new CigarElement(length, CigarOperator.M)); cigarElements.add(new CigarElement(length, CigarOperator.M));
Cigar cigar = new Cigar(cigarElements); Cigar cigar = new Cigar(cigarElements);
return ArtificialSAMUtils.createArtificialRead(cigar); return ArtificialSAMUtils.createArtificialRead(cigar);
@ -536,10 +536,7 @@ public class GATKSAMRecord extends BAMRecord {
* @return True if an attribute has been set for this key. * @return True if an attribute has been set for this key.
*/ */
public boolean containsTemporaryAttribute(Object key) { public boolean containsTemporaryAttribute(Object key) {
if(temporaryAttributes != null) { return temporaryAttributes != null && temporaryAttributes.containsKey(key);
return temporaryAttributes.containsKey(key);
}
return false;
} }
/** /**
@ -556,7 +553,7 @@ public class GATKSAMRecord extends BAMRecord {
*/ */
public Object setTemporaryAttribute(Object key, Object value) { public Object setTemporaryAttribute(Object key, Object value) {
if(temporaryAttributes == null) { if(temporaryAttributes == null) {
temporaryAttributes = new HashMap<Object, Object>(); temporaryAttributes = new HashMap<>();
} }
return temporaryAttributes.put(key, value); return temporaryAttributes.put(key, value);
} }
@ -750,6 +747,46 @@ public class GATKSAMRecord extends BAMRecord {
return emptyRead; return emptyRead;
} }
/**
* Creates a new GATKSAMRecord with the source read's header, read group and mate
* information, but with the following fields set to user-supplied values:
* - Read Bases
* - Base Qualities
* - Base Insertion Qualities
* - Base Deletion Qualities
*
* Cigar string is empty (not-null)
*
* Use this method if you want to create a new GATKSAMRecord based on
* another GATKSAMRecord, but with modified bases and qualities
*
* @param read a read to copy the header from
* @param readBases an array containing the new bases you wish use in place of the originals
* @param baseQualities an array containing the new base qualities you wish use in place of the originals
* @param baseInsertionQualities an array containing the new base insertion qaulities
* @param baseDeletionQualities an array containing the new base deletion qualities
* @return a read with modified bases and qualities, safe for the GATK
*/
public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read,
final byte[] readBases,
final byte[] baseQualities,
final byte[] baseInsertionQualities,
final byte[] baseDeletionQualities) {
if ( baseQualities.length != readBases.length || baseInsertionQualities.length != readBases.length || baseDeletionQualities.length != readBases.length )
throw new IllegalArgumentException("Read bases and read quality arrays aren't the same size: Bases:" + readBases.length
+ " vs Base Q's:" + baseQualities.length
+ " vs Insert Q's:" + baseInsertionQualities.length
+ " vs Delete Q's:" + baseDeletionQualities.length);
final GATKSAMRecord processedRead = GATKSAMRecord.emptyRead(read);
processedRead.setReadBases(readBases);
processedRead.setBaseQualities(baseQualities, EventType.BASE_SUBSTITUTION);
processedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION);
processedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION);
return processedRead;
}
/** /**
* Shallow copy of everything, except for the attribute list and the temporary attributes. * Shallow copy of everything, except for the attribute list and the temporary attributes.
* A new list of the attributes is created for both, but the attributes themselves are copied by reference. * A new list of the attributes is created for both, but the attributes themselves are copied by reference.
@ -762,7 +799,7 @@ public class GATKSAMRecord extends BAMRecord {
public Object clone() throws CloneNotSupportedException { public Object clone() throws CloneNotSupportedException {
final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); final GATKSAMRecord clone = (GATKSAMRecord) super.clone();
if (temporaryAttributes != null) { if (temporaryAttributes != null) {
clone.temporaryAttributes = new HashMap<Object, Object>(); clone.temporaryAttributes = new HashMap<>();
for (Object attribute : temporaryAttributes.keySet()) for (Object attribute : temporaryAttributes.keySet())
clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute));
} }