Improve the PairHMM API for better FPGA integration

Motivation:
The API was different between the regular PairHMM and the FPGA-implementation
via CnyPairHMM. As a result, the LikelihoodCalculationEngine had
to account for this. The goal is to change the API to be the same
for all implementations, and make it easier to access.

PairHMM
PairHMM now accepts a list of reads and a map of alleles/haplotypes and returns a PerReadAlleleLikelihoodMap.
Added a new primary method that loops the reads and haplotypes, extracts qualities,
and passes them to the computeReadLikelihoodGivenHaplotypeLog10 method.
Did not alter that method, or its subcompute method, at all.
PairHMM also now handles its own (re)initialization, so users don't have to worry about that.

CnyPairHMM
Added that same new primary access method to this FPGA class.
Method overrides the default implementation in PairHMM. Walks through a list of reads.
Individual-read quals and the full haplotype list are fed to batchAdd(), as before.
However, instead of waiting for every read to get added, and then walking through the reads
again to extract results, we just get the haplotype-results array for each read as soon as it
is generated, and pack it into a perReadAlleleLikelihoodMap for return.
The main access method is now the same no matter whether the FPGA CnyPairHMM is used or not.

LikelihoodCalculationEngine
The functionality to loop through the reads and haplotypes and get individual log10-likelihoods
was moved to the PairHMM, and so removed from here. However, this class does need to retain
the ability to pre-process the reads, and post-process the resulting likelihoods map.
Those features were separated from running the HMM and refactored into their own methods
Commented out the (unused) system for finding best N haplotypes for genotyping.

PairHMMIndelErrorModel
Changes similar to those in the LikelihoodCalculationEngine were made here. However, in this case
the haplotypes are modified based on each individual read, so the read-list we feed into the HMM only has one read.
This commit is contained in:
bradtaylor 2013-10-21 14:08:03 -04:00
parent 96024403bf
commit e40a07bb58
10 changed files with 547 additions and 401 deletions

View File

@ -52,18 +52,10 @@ import net.sf.samtools.SAMUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM;
import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.pairhmm.*;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -88,7 +80,7 @@ public class LikelihoodCalculationEngine {
private final PairHMM.HMM_IMPLEMENTATION hmmType;
private final boolean noFpga;
private final ThreadLocal<PairHMM> pairHMM = new ThreadLocal<PairHMM>() {
private final ThreadLocal<PairHMM> pairHMMThreadLocal = new ThreadLocal<PairHMM>() {
@Override
protected PairHMM initialValue() {
switch (hmmType) {
@ -109,6 +101,8 @@ public class LikelihoodCalculationEngine {
}
}
};
// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException
// private final PairHMM pairHMM = pairHMMThreadLocal.get();
private final static boolean WRITE_LIKELIHOODS_TO_FILE = false;
private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
@ -173,36 +167,145 @@ public class LikelihoodCalculationEngine {
if ( likelihoodsStream != null ) likelihoodsStream.close();
}
/**
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
*
* After calling this routine the PairHMM will be configured to best evaluate all reads in the samples
* against the set of haplotypes
*
* @param haplotypes a non-null list of haplotypes
* @param perSampleReadList a mapping from sample -> reads
*/
private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
int X_METRIC_LENGTH = 0;
for( final Map.Entry<String, List<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
for( final GATKSAMRecord read : sample.getValue() ) {
final int readLength = read.getReadLength();
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
}
private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){
if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(processedRead.getReadBases() ),
SAMUtils.phredToFastq(processedRead.getBaseQualities() ),
SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ),
SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ),
SAMUtils.phredToFastq(constantGCP),
log10l);
}
int Y_METRIC_LENGTH = 0;
for( final Haplotype h : haplotypes ) {
final int haplotypeLength = h.getBases().length;
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
}
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
}
/**
 * Wrap each haplotype in its Allele representation.
 *
 * @param haplotypes the haplotypes to convert
 * @return a map, in haplotype-list order, from each haplotype's Allele version back to the haplotype itself
 */
private Map<Allele, Haplotype> createAlleleMap(List<Haplotype> haplotypes){
    final Map<Allele, Haplotype> alleleMap = new LinkedHashMap<>(haplotypes.size());
    for ( final Haplotype haplotype : haplotypes ) {
        alleleMap.put(Allele.create(haplotype, true), haplotype);
    }
    return alleleMap;
}
/**
 * Build, for every read, a gap-continuation-penalty (GCP) array of the read's length, filled with constantGCP.
 *
 * @param reads the reads to build GCP arrays for
 * @return a map, in read-list order, from each read to its GCP array
 */
private Map<GATKSAMRecord, byte[]> fillGCPArrays(List<GATKSAMRecord> reads){
    final Map<GATKSAMRecord, byte[]> gcpArraysByRead = new LinkedHashMap<>();
    for ( final GATKSAMRecord read : reads ) {
        final byte[] gcpArray = new byte[read.getReadBases().length];
        Arrays.fill(gcpArray, constantGCP); // Is there a way to derive empirical estimates for this from the data?
        gcpArraysByRead.put(read, gcpArray);
    }
    return gcpArraysByRead;
}
/**
 * In place, cap each base quality by the read's mapping quality (as in UG), then floor the base,
 * insertion, and deletion qualities at QualityUtils.MIN_USABLE_Q_SCORE. Base qualities below
 * BASE_QUALITY_SCORE_THRESHOLD are also dropped to that floor.
 */
private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) {
    final int mappingQuality = read.getMappingQuality();
    for ( int i = 0; i < readQuals.length; i++ ) {
        // 0xff & treats the stored quality byte as unsigned before comparing against the mapping quality
        readQuals[i] = (byte) Math.min(0xff & readQuals[i], mappingQuality);
        if ( readQuals[i] < BASE_QUALITY_SCORE_THRESHOLD ) readQuals[i] = QualityUtils.MIN_USABLE_Q_SCORE;
        if ( readInsQuals[i] < QualityUtils.MIN_USABLE_Q_SCORE ) readInsQuals[i] = QualityUtils.MIN_USABLE_Q_SCORE;
        if ( readDelQuals[i] < QualityUtils.MIN_USABLE_Q_SCORE ) readDelQuals[i] = QualityUtils.MIN_USABLE_Q_SCORE;
    }
}
/**
 * Pre-processing of the reads to be evaluated at the current location from the current sample.
 * We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read.
 * Modified copies of reads are packed into a new list, while original reads are retained for downstream use.
 *
 * @param reads The original list of unmodified reads
 * @return processedReads. A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding
 */
private List<GATKSAMRecord> modifyReadQualities(final List<GATKSAMRecord> reads) {
    // Use a presized ArrayList rather than a LinkedList: downstream code indexes into this list
    // positionally (capReferenceHaplotypeLikelihoods calls processedReads.get(readIndex)), which
    // is O(n) per access on a LinkedList and would make that loop quadratic.
    final List<GATKSAMRecord> processedReads = new ArrayList<>(reads.size());
    for ( final GATKSAMRecord read : reads ) {
        final byte[] readBases = read.getReadBases();
        // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
        final byte[] readQuals = read.getBaseQualities().clone();
        final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
        final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
        applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
        capMinimumReadQualities(read, readQuals, readInsQuals, readDelQuals);
        // Create a new copy of the read with the modified qualities and pack it into the list for return
        processedReads.add(GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals));
    }
    return processedReads;
}
/**
 * Post-processing of the read/allele likelihoods.
 *
 * We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads.
 * We wish to return a map containing the original, unmodified reads.
 *
 * At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate.
 * This protects us from the case where the assembly has produced haplotypes
 * that are very divergent from reference, but are supported by only one read. In effect
 * we cap how badly scoring the reference can be for any read by the chance that the read
 * itself just doesn't belong here.
 *
 * @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. Contains the processed reads, the haplotype Alleles, and their log10ls
 * @param reads Our original, unmodified reads
 * @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls.
 *                       Must be in the same order as reads.
 * @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype
 *
 * @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls
 */
private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List<GATKSAMRecord> reads, List<GATKSAMRecord> processedReads, Map<Allele, Haplotype> alleleHaplotypeMap){
    // a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls
    final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
    Allele refAllele = null;
    final int numReads = reads.size();
    for (int readIndex = 0; readIndex < numReads; readIndex++) {
        // Get the original and quality-modified read from their respective lists
        // Note that this requires both lists to have reads in the same order
        final GATKSAMRecord originalRead = reads.get(readIndex);
        final GATKSAMRecord processedRead = processedReads.get(readIndex);
        // keep track of the reference likelihood and the best non-ref likelihood
        double refLog10l = Double.NEGATIVE_INFINITY;
        double bestNonReflog10L = Double.NEGATIVE_INFINITY;
        // iterate over entries directly instead of keySet() + get(), avoiding a map lookup per allele
        for ( final Map.Entry<Allele, Haplotype> entry : alleleHaplotypeMap.entrySet() ) {
            final Allele allele = entry.getKey();
            final Haplotype haplotype = entry.getValue();
            final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele);
            if ( haplotype.isNonReference() ) {
                bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
            } else {
                refAllele = allele;
                refLog10l = log10l;
            }
            writeDebugLikelihoods(processedRead, haplotype, log10l);
            // add the ORIGINAL (non-capped) read to the final map, along with the current haplotype and associated log10l
            processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l);
        }
        // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
        // mismapping rate. This protects us from the case where the assembly has produced haplotypes
        // that are very divergent from reference, but are supported by only one read. In effect
        // we cap how badly scoring the reference can be for any read by the chance that the read
        // itself just doesn't belong here
        final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
        if ( refLog10l < worstRefLog10Allowed ) {
            // NOTE(review): assumes alleleHaplotypeMap always contains a reference haplotype;
            // if it did not, refAllele would still be null here -- confirm upstream guarantees this
            processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed);
        }
    }
    return processedReadAlleleLikelihoodMap;
}
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
// configure the HMM
initializePairHMM(haplotypes, perSampleReadList);
// Add likelihoods for each sample's reads to our stratifiedReadMap
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>();
@ -218,137 +321,22 @@ public class LikelihoodCalculationEngine {
}
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List<Haplotype> haplotypes, final List<GATKSAMRecord> reads) {
// first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time)
final BatchPairHMM batchPairHMM = (pairHMM.get() instanceof BatchPairHMM) ? (BatchPairHMM)pairHMM.get() : null;
final Vector<GATKSAMRecord> batchedReads = new Vector<GATKSAMRecord>(reads.size());
final int numHaplotypes = haplotypes.size();
final Map<Haplotype, Allele> alleleVersions = new LinkedHashMap<>(numHaplotypes);
Allele refAllele = null;
for ( final Haplotype haplotype : haplotypes ) {
final Allele allele = Allele.create(haplotype, true);
alleleVersions.put(haplotype, allele);
if ( haplotype.isReference() ) refAllele = allele;
}
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final GATKSAMRecord read : reads ) {
// Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities
List<GATKSAMRecord> processedReads = modifyReadQualities(reads);
final byte[] readBases = read.getReadBases();
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// Get alleles corresponding to our haplotypees
Map<Allele, Haplotype> alleleHaplotypeMap = createAlleleMap(haplotypes);
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
// Get an array containing the constantGCP for each read in our modified read list
Map<GATKSAMRecord,byte[]> GCPArrayMap = fillGCPArrays(processedReads);
applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
// Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype
PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap);
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
}
// Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate
if ( batchPairHMM != null ) {
batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
batchedReads.add(read);
continue;
}
// keep track of the reference likelihood and the best non-ref likelihood
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
// iterate over all haplotypes, calculating the likelihood of the read for each haplotype
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final byte[] nextHaplotypeBases = (jjj == numHaplotypes - 1) ? null : haplotypes.get(jjj+1).getBases();
final boolean isFirstHaplotype = jjj == 0;
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(readBases),
SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals),
SAMUtils.phredToFastq(overallGCP),
log10l);
}
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else
refLog10l = log10l;
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
// ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
// that are very divergent from reference, but are supported by only one read. In effect
// we capping how badly scoring the reference can be for any read by the chance that the read
// itself just doesn't belong here
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
}
}
if ( batchPairHMM != null ) {
for( final GATKSAMRecord read : batchedReads ) {
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
final double[] likelihoods = batchPairHMM.batchGetResult();
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final double log10l = likelihoods[jjj];
if ( WRITE_LIKELIHOODS_TO_FILE ) {
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities();
final byte[] readDelQuals = read.getBaseDeletionQualities();
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
// TODO -- why is Q18 hard-coded here???
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
}
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(read.getReadBases()),
SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals),
SAMUtils.phredToFastq(overallGCP),
log10l);
}
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else
refLog10l = log10l;
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
}
}
}
return perReadAlleleLikelihoodMap;
return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap);
}
@Requires({"alleleOrdering.size() > 0"})
@ -421,125 +409,125 @@ public class LikelihoodCalculationEngine {
// System to compute the best N haplotypes for genotyping
//
// --------------------------------------------------------------------------------
/**
 * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
 * @param map an annoying map object that moves us between the allele and haplotype representation
 * @param haplotypeAsAllele the allele version of the haplotype
 * @return the haplotype version, with its score incremented by 1 if its non-reference
 */
private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
    final Haplotype selected = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
    if ( selected.isNonReference() ) {
        selected.setScore(selected.getScore() + 1); // ref is already at max value
    }
    return selected;
}
/**
 * Take the best N haplotypes and return them as a list
 *
 * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
 * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
 * order of score (so higher score haplotypes are preferred). The N we take is determined by
 *
 * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
 *
 * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
 * bounded by maxNumHaplotypesInPopulation as that number can grow without bound
 *
 * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
 * @param nSamples the number of samples used to select the haplotypes
 * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
 * @return a list of N or fewer haplotypes, with the reference haplotype first
 */
private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
    // rank the selected haplotypes from best score to worst
    final List<Haplotype> rankedHaplotypes = new ArrayList<Haplotype>(selectedHaplotypes);
    Collections.sort(rankedHaplotypes, new HaplotypeScoreComparator());
    // N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
    final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
    final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
    final List<Haplotype> bestHaplotypes;
    if ( rankedHaplotypes.size() <= haplotypesToKeep ) {
        bestHaplotypes = rankedHaplotypes;
    } else {
        bestHaplotypes = rankedHaplotypes.subList(0, haplotypesToKeep);
    }
    if ( bestHaplotypes.get(0).isNonReference() ) {
        throw new IllegalStateException("BUG: reference haplotype should be first in list");
    }
    return bestHaplotypes;
}
/**
* Select the best haplotypes for genotyping the samples in stratifiedReadMap
*
* Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
* haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
* all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
* one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
* the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
* haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
*
* @param haplotypes a list of all haplotypes we're considering
* @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
* @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
* @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
*/
public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
// all of the haplotypes that at least one sample called as one of the most likely
final Set<Haplotype> selectedHaplotypes = new HashSet<>();
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
// our annoying map from allele -> haplotype
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
for ( final Haplotype h : haplotypes ) {
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
allele2Haplotype.put(Allele.create(h, h.isReference()), h);
}
// for each sample, compute the most likely pair of haplotypes
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
// get the two most likely haplotypes under a diploid model for this sample
final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
if ( mla != null ) { // there was something to evaluate in this sample
// note that there must be at least 2 haplotypes
final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
// if ( DEBUG ) {
// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
//
// /**
// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
// * @param map an annoying map object that moves us between the allele and haplotype representation
// * @param haplotypeAsAllele the allele version of the haplotype
// * @return the haplotype version, with its score incremented by 1 if its non-reference
// */
// private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
// return h;
// }
//
// /**
// * Take the best N haplotypes and return them as a list
// *
// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
// * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
// * order of score (so higher score haplotypes are preferred). The N we take is determined by
// *
// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
// *
// * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound
// *
// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
// * @param nSamples the number of samples used to select the haplotypes
// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
// * @return a list of N or fewer haplotypes, with the reference haplotype first
// */
// private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
// final List<Haplotype> selectedHaplotypesList = new ArrayList<>(selectedHaplotypes);
// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
// final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
// return bestHaplotypes;
// }
//
// /**
// * Select the best haplotypes for genotyping the samples in stratifiedReadMap
// *
// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
// * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
// *
// * @param haplotypes a list of all haplotypes we're considering
// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
// * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
// */
// public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
//
// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
//
// // all of the haplotypes that at least one sample called as one of the most likely
// final Set<Haplotype> selectedHaplotypes = new HashSet<>();
// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
//
// // our annoying map from allele -> haplotype
// final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
// for ( final Haplotype h : haplotypes ) {
// h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
// allele2Haplotype.put(Allele.create(h, h.isReference()), h);
// }
//
// // for each sample, compute the most likely pair of haplotypes
// for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
// // get the two most likely haplotypes under a diploid model for this sample
// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
//
// if ( mla != null ) { // there was something to evaluate in this sample
// // note that there must be at least 2 haplotypes
// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
//
//// if ( DEBUG ) {
//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
//// }
//
// // add these two haplotypes to the set of haplotypes that have been selected
// selectedHaplotypes.add(best);
// selectedHaplotypes.add(second);
//
// // we've already selected all of our haplotypes, and we don't need to prune them down
// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
// break;
// }
// add these two haplotypes to the set of haplotypes that have been selected
selectedHaplotypes.add(best);
selectedHaplotypes.add(second);
// we've already selected all of our haplotypes, and we don't need to prune them down
if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
break;
}
}
// take the best N haplotypes forward, in order of the number of samples that choose them
final int nSamples = stratifiedReadMap.size();
final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
if ( DEBUG ) {
logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
for ( final Haplotype h : bestHaplotypes ) {
logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
}
}
return bestHaplotypes;
}
/**
 * Locate the reference haplotype within a list of haplotypes.
 *
 * @param haplotypes non-null list of haplotypes to search
 * @return the haplotype whose isReference() is true
 * @throws ReviewedStingException if the list contains no reference haplotype
 */
private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
    for ( int i = 0; i < haplotypes.size(); i++ ) {
        final Haplotype candidate = haplotypes.get(i);
        if ( candidate.isReference() )
            return candidate;
    }
    throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}
// }
//
// // take the best N haplotypes forward, in order of the number of samples that choose them
// final int nSamples = stratifiedReadMap.size();
// final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
//
// if ( DEBUG ) {
// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
// for ( final Haplotype h : bestHaplotypes ) {
// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
// }
// }
// return bestHaplotypes;
// }
//
// /**
// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
// * @param haplotypes non-null list of haplotypes
// * @return the reference haplotype
// */
// private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
// for( final Haplotype h : haplotypes ) {
// if( h.isReference() ) return h;
// }
// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
// }
// --------------------------------------------------------------------------------
//

View File

@ -48,11 +48,11 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
@ -65,6 +65,7 @@ import org.broadinstitute.variant.variantcontext.Allele;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map;
//import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM;
@ -206,6 +207,39 @@ public class PairHMMIndelErrorModel {
}
}
/**
 * Trim each haplotype in the map down to the reference window shared with the current read.
 *
 * NOTE(review): the start/stop window bounds are clamped per haplotype and the clamped
 * values carry over to subsequent haplotypes in the map — this preserves the original
 * per-read trimming behavior; confirm it is intentional rather than a leftover.
 *
 * @param haplotypeMap map from allele to the full-length haplotype
 * @param startLocationInRefForHaplotypes initial reference start of the trimming window
 * @param stopLocationInRefForHaplotypes  initial reference stop of the trimming window
 * @param ref the reference context (used only for debug output)
 * @return a new map from each allele to its trimmed haplotype, in the same iteration order
 */
private LinkedHashMap<Allele, Haplotype> trimHaplotypes(final LinkedHashMap<Allele, Haplotype> haplotypeMap,
                                                        long startLocationInRefForHaplotypes,
                                                        long stopLocationInRefForHaplotypes,
                                                        final ReferenceContext ref){
    final LinkedHashMap<Allele, Haplotype> trimmedHaplotypeMap = new LinkedHashMap<>();
    // iterate entries directly instead of keySet() + get() lookups
    for (final Map.Entry<Allele, Haplotype> entry : haplotypeMap.entrySet()) {
        final Haplotype haplotype = entry.getValue();

        // clamp the trimming window so it lies within this haplotype's genomic span
        if (stopLocationInRefForHaplotypes > haplotype.getStopPosition())
            stopLocationInRefForHaplotypes = haplotype.getStopPosition();
        if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
            startLocationInRefForHaplotypes = haplotype.getStartPosition();
        else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
            startLocationInRefForHaplotypes = haplotype.getStopPosition();

        // convert window bounds from reference coordinates to indices into the haplotype's bases
        final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
        final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();

        if (DEBUG)
            System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n",
                    indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);

        // get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map
        final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
        trimmedHaplotypeMap.put(entry.getKey(), new Haplotype(trimmedHaplotypeBases, haplotype.isReference()));
    }
    return trimmedHaplotypeMap;
}
public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
@ -231,6 +265,8 @@ public class PairHMMIndelErrorModel {
final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final LinkedList<GATKSAMRecord> readList = new LinkedList<>();
final Map<GATKSAMRecord, byte[]> readGCPArrayMap = new LinkedHashMap<>();
int readIdx=0;
for (PileupElement p: pileup) {
// > 1 when the read is a consensus read representing multiple independent observations
@ -369,86 +405,30 @@ public class PairHMMIndelErrorModel {
baseDeletionQualities = contextLogGapOpenProbabilities;
}
byte[] currentHaplotypeBases = null;
boolean firstHap = true;
double readLikelihood;
Allele currentAllele = null;
for (Allele a: haplotypeMap.keySet()) {
// Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM
final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities);
readList.add(processedRead);
Haplotype haplotype = haplotypeMap.get(a);
// Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM
readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities);
if (stopLocationInRefForHaplotypes > haplotype.getStopPosition())
stopLocationInRefForHaplotypes = haplotype.getStopPosition();
// Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations
final Map<Allele, Haplotype> trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref);
if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
startLocationInRefForHaplotypes = haplotype.getStartPosition();
else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
startLocationInRefForHaplotypes = haplotype.getStopPosition();
// Get the likelihoods for our clipped read against each of our trimmed haplotypes.
final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap);
final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
if (DEBUG)
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString());
// peek at the next haplotype in the list
final byte[] nextHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
// process the current haplotype in the list
if (currentHaplotypeBases != null) {
// it's possible that the indel starts at the last base of the haplotypes
if ( currentHaplotypeBases.length == 0 ) {
readLikelihood = -Double.MAX_VALUE;
} else {
if (firstHap) {
//no need to reallocate arrays for each new haplotype, as length won't change
pairHMM.initialize(readBases.length, currentHaplotypeBases.length);
firstHap = false;
}
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals,
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, nextHaplotypeBases);
}
if (DEBUG) {
System.out.println("H:"+new String(currentHaplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
}
perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
}
// update the current haplotype
currentHaplotypeBases = nextHaplotypeBases;
currentAllele = a;
}
// process the final haplotype
if (currentHaplotypeBases != null) {
// it's possible that the indel starts at the last base of the haplotypes
if ( currentHaplotypeBases.length == 0 ) {
readLikelihood = -Double.MAX_VALUE;
} else {
if (firstHap) {
//no need to reallocate arrays for each new haplotype, as length won't change
pairHMM.initialize(readBases.length, currentHaplotypeBases.length);
firstHap = false;
}
// there is no next haplotype, so pass null for nextHaplotypeBases.
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals,
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, null);
}
if (DEBUG) {
System.out.println("H:"+new String(currentHaplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
}
perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood);
// Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array
for (Allele a: trimmedHaplotypeMap.keySet()){
double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a);
perReadAlleleLikelihoodMap.add(p, a, readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
}
// The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually
readList.remove(processedRead);
// The same is true for the read/GCP-array map
readGCPArrayMap.remove(processedRead);
}
}
readIdx++;
@ -472,16 +452,16 @@ public class PairHMMIndelErrorModel {
return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength));
}
/**
 * Find the index of the first base at which two equal-length byte arrays differ.
 *
 * @param b1 first byte array
 * @param b2 second byte array
 * @return the index of the first mismatch; b1.length when the arrays are identical;
 *         0 when the arrays have different lengths (sanity check)
 */
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
    if (b1.length != b2.length)
        return 0; // sanity check

    int idx = 0;
    while (idx < b1.length && b1[idx] == b2[idx])
        idx++;
    return idx;
}
// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
// if (b1.length != b2.length)
// return 0; // sanity check
//
// for (int i=0; i < b1.length; i++ ){
// if ( b1[i]!= b2[i] )
// return i;
// }
// return b1.length;
// }
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];

View File

@ -49,6 +49,7 @@ package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
/**

View File

@ -47,12 +47,16 @@
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import java.io.File;
import java.lang.reflect.Field;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
private static class HmmInput {
@ -62,14 +66,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
public byte[] deletionGOP;
public byte[] overallGCP;
public List<Haplotype> haplotypes;
};
}
public static class ResultQueue {
private int offset;
private List<double[]> batchResults;
public ResultQueue() {
batchResults = new LinkedList<double[]>();
batchResults = new LinkedList<>();
offset = 0;
}
@ -92,7 +96,7 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
final static String libName = "gmvhdl_gatk_hmm";
private static boolean loaded = false;
private List<HmmInput> batchRequests = new LinkedList<HmmInput>();
private List<HmmInput> batchRequests = new LinkedList<>();
private ResultQueue resultQueue = new ResultQueue();
static public boolean isAvailable() {
@ -184,6 +188,55 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
return results;
}
/**
 * {@inheritDoc}
 */
@Override
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap){
    // (re)initialize the pairHMM if it has never been set up, or if this batch contains
    // reads/haplotypes longer than the lengths it was last sized for (matches the guard
    // used by the base-class implementation; previously only !initialized was checked,
    // leaving too-small buffers for later, larger batches)
    final int readMaxLength = findMaxReadLength(reads);
    final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
    if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) {
        initialize(readMaxLength, haplotypeMaxLength);
    }

    // Pass the read bases/quals, and the haplotypes as a list into the HMM
    performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap);

    // Get the log10-likelihoods for each read/haplotype and pack into the results map
    final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
    collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap);
    return likelihoodMap;
}
/**
 * Drain one result array per read from the FPGA result queue and record the per-allele
 * log10 likelihoods in the supplied map. Results arrive in the same read order used
 * when the batch was added; each array's entries follow the haplotype-map iteration order.
 *
 * @param reads the reads that were batch-added, in submission order
 * @param alleleHaplotypeMap the allele-to-haplotype map used for the batch
 * @param likelihoodMap output map receiving (read, allele, log10 likelihood) entries
 */
private void collectLikelihoodResults(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) {
    for (final GATKSAMRecord read : reads) {
        final double[] likelihoods = batchGetResult();
        int resultIndex = 0;
        for (final Allele allele : alleleHaplotypeMap.keySet()) {
            likelihoodMap.add(read, allele, likelihoods[resultIndex]);
            resultIndex++;
        }
    }
}
/**
 * Feed each read's bases/qualities, together with the full haplotype list and the read's
 * gap-continuation-penalty array, into the FPGA batch queue via batchAdd().
 *
 * @param reads the reads to enqueue
 * @param alleleHaplotypeMap the allele-to-haplotype map; its haplotypes accompany every read
 * @param GCPArrayMap per-read gap-continuation-penalty arrays
 */
private void performBatchAdditions(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, Map<GATKSAMRecord, byte[]> GCPArrayMap) {
    final List<Haplotype> haplotypeList = getHaplotypeList(alleleHaplotypeMap);
    for (final GATKSAMRecord read : reads) {
        batchAdd(haplotypeList,
                 read.getReadBases(),
                 read.getBaseQualities(),
                 read.getBaseInsertionQualities(),
                 read.getBaseDeletionQualities(),
                 GCPArrayMap.get(read));
    }
}
protected double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
@ -196,6 +249,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
return 0.0;
}
/**
 * Extract the haplotypes from an allele->haplotype map, preserving the map's iteration order.
 *
 * @param alleleHaplotypeMap map from allele to haplotype
 * @return a new list containing the map's haplotype values, in iteration order
 */
private List<Haplotype> getHaplotypeList(Map<Allele, Haplotype> alleleHaplotypeMap){
    // values() iterates in the same order as keySet(); no need for per-key lookups
    return new LinkedList<>(alleleHaplotypeMap.values());
}
private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) {
double[] results = null;
int n = dequeueRequirement(haplotypeBases.length, readBases.length);

View File

@ -70,17 +70,6 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
private static final int deletionToDeletion = 5;
/**
* {@inheritDoc}
*/
@Override
public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(readMaxLength, haplotypeMaxLength);
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**
* {@inheritDoc}
*/

View File

@ -110,7 +110,7 @@ public class PerReadAlleleLikelihoodMap {
* @return a map from each allele to a list of reads that 'support' the allele
*/
protected Map<Allele,List<GATKSAMRecord>> getAlleleStratifiedReadMap() {
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>(alleles.size());
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<>(alleles.size());
for ( final Allele allele : alleles )
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
@ -152,7 +152,7 @@ public class PerReadAlleleLikelihoodMap {
/**
* Does the current map contain the key associated with a particular SAM record in pileup?
* @param p Pileup element
* @return
* @return true if the map contains pileup element, else false
*/
public boolean containsPileupElement(final PileupElement p) {
return likelihoodReadMap.containsKey(p.getRead());
@ -176,9 +176,9 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.keySet();
}
public Collection<Map<Allele,Double>> getLikelihoodMapValues() {
return likelihoodReadMap.values();
}
// public Collection<Map<Allele,Double>> getLikelihoodMapValues() {
// return likelihoodReadMap.values();
// }
public int getNumberOfStoredElements() {
return likelihoodReadMap.size();
@ -191,6 +191,21 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.get(p.getRead());
}
/**
 * Get the log10 likelihood associated with an individual read/allele
 *
 * @param read the read whose likelihood we want
 * @param allele the allele whose likelihood we want
 * @return the log10 likelihood that this read matches this allele, or 0.0 when the
 *         read/allele pair is not present in the map. NOTE(review): 0.0 is log10(1.0),
 *         a misleading sentinel — callers should ensure the pair exists before
 *         interpreting the result as a likelihood.
 */
public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){
    if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read))
        return 0.0;
    // the per-read map may lack this allele even when allelesSet contains it;
    // guard against the null-unboxing NPE the unchecked get() would cause
    final Double log10Likelihood = likelihoodReadMap.get(read).get(allele);
    return log10Likelihood == null ? 0.0 : log10Likelihood;
}
/**
* Get the most likely alleles estimated across all reads in this object
*

View File

@ -85,9 +85,6 @@ public final class Log10PairHMM extends N2MemoryPairHMM {
Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY);
}
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**

View File

@ -26,10 +26,6 @@
package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Arrays;
/**
* Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix
@ -58,6 +54,9 @@ abstract class N2MemoryPairHMM extends PairHMM {
matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**

View File

@ -28,9 +28,14 @@ package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
*
@ -59,7 +64,7 @@ public abstract class PairHMM {
protected int maxHaplotypeLength, maxReadLength;
protected int paddedMaxReadLength, paddedMaxHaplotypeLength;
protected int paddedReadLength, paddedHaplotypeLength;
private boolean initialized = false;
protected boolean initialized = false;
// only used for debugging purposes
protected boolean doNotUseTristateCorrection = false;
@ -73,7 +78,7 @@ public abstract class PairHMM {
* @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
* @param readMaxLength the max length of reads we want to use with this PairHMM
*/
public void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
protected void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength);
if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength);
@ -90,6 +95,79 @@ public abstract class PairHMM {
initialized = true;
}
/**
 * Compute the length of the longest read in the list.
 *
 * @param reads reads to scan
 * @return the maximum read length, or 0 for an empty list
 */
protected int findMaxReadLength(final List<GATKSAMRecord> reads) {
    int maxLength = 0;
    for (final GATKSAMRecord read : reads)
        maxLength = Math.max(maxLength, read.getReadLength());
    return maxLength;
}
/**
 * Compute the length (in bases) of the longest haplotype in the map.
 *
 * @param haplotypeMap map from allele to haplotype
 * @return the maximum haplotype length, or 0 for an empty map
 */
protected int findMaxHaplotypeLength(final Map<Allele, Haplotype> haplotypeMap) {
    int maxLength = 0;
    // iterate values directly rather than keySet() + get() lookups
    for (final Haplotype haplotype : haplotypeMap.values())
        maxLength = Math.max(maxLength, haplotype.getBases().length);
    return maxLength;
}
/**
 * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from
 * each haplotype given base substitution, insertion, and deletion probabilities.
 *
 * @param reads the list of reads
 * @param alleleHaplotypeMap map from each allele to its haplotype
 * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read.
 * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of
 *         said read coming from the said haplotype under the provided error model
 */
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap) {
    // (re)initialize the pairHMM only if necessary
    final int readMaxLength = findMaxReadLength(reads);
    final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
    if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); }

    final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
    for (final GATKSAMRecord read : reads) {
        final byte[] readBases = read.getReadBases();
        final byte[] readQuals = read.getBaseQualities();
        final byte[] readInsQuals = read.getBaseInsertionQualities();
        final byte[] readDelQuals = read.getBaseDeletionQualities();
        final byte[] overallGCP = GCPArrayMap.get(read);

        // We walk the haplotypes one position behind the iterator so each compute call can
        // peek at the NEXT haplotype's bases (required for caching in the array implementation).
        byte[] currentHaplotypeBases = null;
        Allele currentAllele = null;
        // true only for the first haplotype evaluated against this read, so read-derived
        // quantities are cached once and reused for the remaining haplotypes
        boolean isFirstHaplotype = true;
        for (final Map.Entry<Allele, Haplotype> entry : alleleHaplotypeMap.entrySet()) {
            final byte[] nextHaplotypeBases = entry.getValue().getBases();
            if (currentHaplotypeBases != null) {
                final double log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
                        readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
                likelihoodMap.add(read, currentAllele, log10l);
                // fix: this flag was never cleared, which forced a full read-value recache
                // on every haplotype and defeated the caching the peek-ahead exists for
                isFirstHaplotype = false;
            }
            // update the current haplotype
            currentHaplotypeBases = nextHaplotypeBases;
            currentAllele = entry.getKey();
        }
        // process the final haplotype; there is no next haplotype, so pass null for nextHaplotypeBases
        if (currentHaplotypeBases != null) {
            final double log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
                    readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null);
            likelihoodMap.add(read, currentAllele, log10l);
        }
    }
    return likelihoodMap;
}
/**
* Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion
* probabilities.
@ -110,7 +188,7 @@ public abstract class PairHMM {
* parameters are the same, and only the haplotype bases are changing underneath us
* @return the log10 probability of read coming from the haplotype under the provided error model
*/
public final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
@ -118,6 +196,7 @@ public abstract class PairHMM {
final byte[] overallGCP,
final boolean recacheReadValues,
final byte[] nextHaploytpeBases) {
if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10");
if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null");
if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength);

View File

@ -139,7 +139,7 @@ public class GATKSAMRecord extends BAMRecord {
}
public static GATKSAMRecord createRandomRead(int length) {
List<CigarElement> cigarElements = new LinkedList<CigarElement>();
List<CigarElement> cigarElements = new LinkedList<>();
cigarElements.add(new CigarElement(length, CigarOperator.M));
Cigar cigar = new Cigar(cigarElements);
return ArtificialSAMUtils.createArtificialRead(cigar);
@ -536,10 +536,7 @@ public class GATKSAMRecord extends BAMRecord {
* @return True if an attribute has been set for this key.
*/
public boolean containsTemporaryAttribute(Object key) {
    // a record with no temporary-attribute map cannot contain any key
    // (fix: the span previously contained both the old if/return body and the new
    // single-expression return, leaving an unreachable statement — a compile error)
    return temporaryAttributes != null && temporaryAttributes.containsKey(key);
}
/**
@ -556,7 +553,7 @@ public class GATKSAMRecord extends BAMRecord {
*/
public Object setTemporaryAttribute(Object key, Object value) {
    // lazily create the attribute map on first use
    // (fix: removed a duplicated assignment left over from an interleaved diff)
    if (temporaryAttributes == null) {
        temporaryAttributes = new HashMap<>();
    }
    return temporaryAttributes.put(key, value);
}
@ -750,6 +747,46 @@ public class GATKSAMRecord extends BAMRecord {
return emptyRead;
}
/**
 * Creates a new GATKSAMRecord with the source read's header, read group and mate
 * information, but with the following fields set to user-supplied values:
 *  - Read Bases
 *  - Base Qualities
 *  - Base Insertion Qualities
 *  - Base Deletion Qualities
 *
 * Cigar string is empty (not-null)
 *
 * Use this method if you want to create a new GATKSAMRecord based on
 * another GATKSAMRecord, but with modified bases and qualities
 *
 * @param read a read to copy the header from
 * @param readBases an array containing the new bases you wish use in place of the originals
 * @param baseQualities an array containing the new base qualities you wish use in place of the originals
 * @param baseInsertionQualities an array containing the new base insertion qualities
 * @param baseDeletionQualities an array containing the new base deletion qualities
 * @return a read with modified bases and qualities, safe for the GATK
 */
public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read,
                                                      final byte[] readBases,
                                                      final byte[] baseQualities,
                                                      final byte[] baseInsertionQualities,
                                                      final byte[] baseDeletionQualities) {
    // all three quality arrays must be parallel to the base array
    final int expectedLength = readBases.length;
    if ( baseQualities.length != expectedLength || baseInsertionQualities.length != expectedLength || baseDeletionQualities.length != expectedLength )
        throw new IllegalArgumentException("Read bases and read quality arrays aren't the same size: Bases:" + readBases.length
                + " vs Base Q's:" + baseQualities.length
                + " vs Insert Q's:" + baseInsertionQualities.length
                + " vs Delete Q's:" + baseDeletionQualities.length);

    // start from an empty clone of the source read, then install the supplied arrays
    final GATKSAMRecord modifiedRead = GATKSAMRecord.emptyRead(read);
    modifiedRead.setReadBases(readBases);
    modifiedRead.setBaseQualities(baseQualities, EventType.BASE_SUBSTITUTION);
    modifiedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION);
    modifiedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION);
    return modifiedRead;
}
/**
* Shallow copy of everything, except for the attribute list and the temporary attributes.
* A new list of the attributes is created for both, but the attributes themselves are copied by reference.
@ -762,7 +799,7 @@ public class GATKSAMRecord extends BAMRecord {
public Object clone() throws CloneNotSupportedException {
final GATKSAMRecord clone = (GATKSAMRecord) super.clone();
if (temporaryAttributes != null) {
clone.temporaryAttributes = new HashMap<Object, Object>();
clone.temporaryAttributes = new HashMap<>();
for (Object attribute : temporaryAttributes.keySet())
clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute));
}