Merge pull request #417 from broadinstitute/bt_pairhmm_api_cleanup2

Improve the PairHMM API for better FPGA integration
This commit is contained in:
MauricioCarneiro 2013-11-14 10:47:07 -08:00
commit 7f08250870
10 changed files with 547 additions and 401 deletions

View File

@ -52,18 +52,10 @@ import net.sf.samtools.SAMUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM;
import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.pairhmm.*;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -88,7 +80,7 @@ public class LikelihoodCalculationEngine {
private final PairHMM.HMM_IMPLEMENTATION hmmType;
private final boolean noFpga;
private final ThreadLocal<PairHMM> pairHMM = new ThreadLocal<PairHMM>() {
private final ThreadLocal<PairHMM> pairHMMThreadLocal = new ThreadLocal<PairHMM>() {
@Override
protected PairHMM initialValue() {
switch (hmmType) {
@ -109,6 +101,8 @@ public class LikelihoodCalculationEngine {
}
}
};
// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException
// private final PairHMM pairHMM = pairHMMThreadLocal.get();
private final static boolean WRITE_LIKELIHOODS_TO_FILE = false;
private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
@ -173,36 +167,145 @@ public class LikelihoodCalculationEngine {
if ( likelihoodsStream != null ) likelihoodsStream.close();
}
/**
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
*
* After calling this routine the PairHMM will be configured to best evaluate all reads in the samples
* against the set of haplotypes
*
* @param haplotypes a non-null list of haplotypes
* @param perSampleReadList a mapping from sample -> reads
*/
private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
int X_METRIC_LENGTH = 0;
for( final Map.Entry<String, List<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
for( final GATKSAMRecord read : sample.getValue() ) {
final int readLength = read.getReadLength();
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
}
private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){
if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(processedRead.getReadBases() ),
SAMUtils.phredToFastq(processedRead.getBaseQualities() ),
SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ),
SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ),
SAMUtils.phredToFastq(constantGCP),
log10l);
}
int Y_METRIC_LENGTH = 0;
for( final Haplotype h : haplotypes ) {
final int haplotypeLength = h.getBases().length;
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
}
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
}
/**
 * Build a map from the Allele representation of each haplotype to the haplotype itself.
 *
 * Each haplotype is wrapped via {@code Allele.create(haplotype, true)}; iteration order of the
 * returned map follows the input list order (LinkedHashMap).
 *
 * @param haplotypes the haplotypes to wrap as alleles
 * @return a map from the allele form of each haplotype to the haplotype, in input order
 */
private Map<Allele, Haplotype> createAlleleMap(List<Haplotype> haplotypes){
    // LinkedHashMap preserves the caller's haplotype ordering; presized to avoid rehashing
    final Map<Allele, Haplotype> result = new LinkedHashMap<>(haplotypes.size());
    for ( final Haplotype h : haplotypes ) {
        result.put(Allele.create(h, true), h);
    }
    return result;
}
/**
 * Create, for every read, a gap continuation penalty (GCP) array of the same length as the
 * read's bases, uniformly filled with the engine-wide constantGCP value.
 *
 * @param reads the reads to build GCP arrays for
 * @return a map from each read to its constant-filled GCP array, in input order
 */
private Map<GATKSAMRecord, byte[]> fillGCPArrays(List<GATKSAMRecord> reads){
    final Map<GATKSAMRecord, byte[]> gcpByRead = new LinkedHashMap<>();
    for ( final GATKSAMRecord read : reads ) {
        // one GCP entry per base, all set to the constant penalty
        final byte[] gcp = new byte[read.getReadBases().length];
        Arrays.fill(gcp, constantGCP); // Is there a way to derive empirical estimates for this from the data?
        gcpByRead.put(read, gcp);
    }
    return gcpByRead;
}
/**
 * In-place quality capping for one read's quality arrays.
 *
 * For each position: the base quality is first capped by the read's mapping quality; any
 * base quality that is then below BASE_QUALITY_SCORE_THRESHOLD is floored to
 * QualityUtils.MIN_USABLE_Q_SCORE. Insertion and deletion qualities below
 * MIN_USABLE_Q_SCORE are likewise floored. All three arrays are mutated in place.
 *
 * @param read the read supplying the mapping quality used as the cap
 * @param readQuals base qualities, modified in place (assumed same length as the other arrays — TODO confirm)
 * @param readInsQuals base insertion qualities, modified in place
 * @param readDelQuals base deletion qualities, modified in place
 */
private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) {
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
// 0xff & treats the stored byte as unsigned before taking the min with mapping quality
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
}
}
/**
* Pre-processing of the reads to be evaluated at the current location from the current sample.
* We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read.
* Modified copies of reads are packed into a new list, while original reads are retained for downstream use
*
* @param reads The original list of unmodified reads
* @return processedReads. A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding
*/
/**
 * Pre-processing of the reads to be evaluated at the current location from the current sample.
 * Applies the PCR error model and caps the minimum base, insertion, and deletion qualities of
 * each read. The originals are never mutated: quality arrays are cloned before modification
 * and a fresh quality-modified read is created for each input.
 *
 * @param reads the original, unmodified reads
 * @return a new list of quality-modified reads, in the same order as the input
 */
private List<GATKSAMRecord> modifyReadQualities(final List<GATKSAMRecord> reads) {
    final List<GATKSAMRecord> result = new LinkedList<>();
    for ( final GATKSAMRecord original : reads ) {
        final byte[] bases = original.getReadBases();
        // NOTE -- clone everything that gets modified so future uses of the read are unaffected
        final byte[] baseQuals = original.getBaseQualities().clone();
        final byte[] insQuals = original.getBaseInsertionQualities().clone();
        final byte[] delQuals = original.getBaseDeletionQualities().clone();
        applyPCRErrorModel(bases, insQuals, delQuals);
        capMinimumReadQualities(original, baseQuals, insQuals, delQuals);
        // pack a fresh copy of the read, carrying the modified qualities, into the output list
        result.add(GATKSAMRecord.createQualityModifiedRead(original, bases, baseQuals, insQuals, delQuals));
    }
    return result;
}
/**
* Post-processing of the read/allele likelihoods.
*
* We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads.
* We wish to return a map containing the original, unmodified reads.
*
* At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate.
* This protects us from the case where the assembly has produced haplotypes
* that are very divergent from reference, but are supported by only one read. In effect
* we cap how badly the reference can score for any read, by the chance that the read
* itself just doesn't belong here
*
* @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. Contains the processed reads, the haplotype Alleles, and their log10ls
* @param reads Our original, unmodified reads
* @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls
* @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype
*
* @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls
*/
private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List<GATKSAMRecord> reads, List<GATKSAMRecord> processedReads, Map<Allele, Haplotype> alleleHaplotypeMap){
// a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls
final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
// set when the reference haplotype's allele is encountered in the inner loop
// NOTE(review): if alleleHaplotypeMap contains no reference haplotype this stays null — confirm callers always supply one
Allele refAllele = null;
final int numReads = reads.size();
for (int readIndex = 0; readIndex < numReads; readIndex++) {
// Get the original and quality-modified read from their respective lists
// Note that this requires both lists to have reads in the same order
final GATKSAMRecord originalRead = reads.get(readIndex);
final GATKSAMRecord processedRead = processedReads.get(readIndex);
// keep track of the reference likelihood and the best non-ref likelihood
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
for ( Allele allele : alleleHaplotypeMap.keySet() ) {
// the likelihood was computed against the quality-capped (processed) version of the read
final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele);
final Haplotype haplotype = alleleHaplotypeMap.get(allele);
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else {
refAllele = allele;
refLog10l = log10l;
}
writeDebugLikelihoods(processedRead, haplotype, log10l);
// add the ORIGINAL (non-capped) read to the final map, along with the current haplotype and associated log10l
processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l);
}
// ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
// that are very divergent from reference, but are supported by only one read. In effect
// we cap how badly the reference can score for any read, by the chance that the read
// itself just doesn't belong here
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed);
}
}
return processedReadAlleleLikelihoodMap;
}
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
// configure the HMM
initializePairHMM(haplotypes, perSampleReadList);
// Add likelihoods for each sample's reads to our stratifiedReadMap
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>();
@ -218,137 +321,22 @@ public class LikelihoodCalculationEngine {
}
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List<Haplotype> haplotypes, final List<GATKSAMRecord> reads) {
// first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time)
final BatchPairHMM batchPairHMM = (pairHMM.get() instanceof BatchPairHMM) ? (BatchPairHMM)pairHMM.get() : null;
final Vector<GATKSAMRecord> batchedReads = new Vector<GATKSAMRecord>(reads.size());
final int numHaplotypes = haplotypes.size();
final Map<Haplotype, Allele> alleleVersions = new LinkedHashMap<>(numHaplotypes);
Allele refAllele = null;
for ( final Haplotype haplotype : haplotypes ) {
final Allele allele = Allele.create(haplotype, true);
alleleVersions.put(haplotype, allele);
if ( haplotype.isReference() ) refAllele = allele;
}
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final GATKSAMRecord read : reads ) {
// Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities
List<GATKSAMRecord> processedReads = modifyReadQualities(reads);
final byte[] readBases = read.getReadBases();
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// Get alleles corresponding to our haplotypes
Map<Allele, Haplotype> alleleHaplotypeMap = createAlleleMap(haplotypes);
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
// Get an array containing the constantGCP for each read in our modified read list
Map<GATKSAMRecord,byte[]> GCPArrayMap = fillGCPArrays(processedReads);
applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
// Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype
PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap);
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
}
// Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate
if ( batchPairHMM != null ) {
batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
batchedReads.add(read);
continue;
}
// keep track of the reference likelihood and the best non-ref likelihood
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
// iterate over all haplotypes, calculating the likelihood of the read for each haplotype
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final byte[] nextHaplotypeBases = (jjj == numHaplotypes - 1) ? null : haplotypes.get(jjj+1).getBases();
final boolean isFirstHaplotype = jjj == 0;
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(readBases),
SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals),
SAMUtils.phredToFastq(overallGCP),
log10l);
}
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else
refLog10l = log10l;
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
// ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
// that are very divergent from reference, but are supported by only one read. In effect
// we capping how badly scoring the reference can be for any read by the chance that the read
// itself just doesn't belong here
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
}
}
if ( batchPairHMM != null ) {
for( final GATKSAMRecord read : batchedReads ) {
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
final double[] likelihoods = batchPairHMM.batchGetResult();
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final double log10l = likelihoods[jjj];
if ( WRITE_LIKELIHOODS_TO_FILE ) {
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities();
final byte[] readDelQuals = read.getBaseDeletionQualities();
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
// TODO -- why is Q18 hard-coded here???
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
}
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(read.getReadBases()),
SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals),
SAMUtils.phredToFastq(overallGCP),
log10l);
}
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else
refLog10l = log10l;
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
}
}
}
return perReadAlleleLikelihoodMap;
return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap);
}
@Requires({"alleleOrdering.size() > 0"})
@ -421,125 +409,125 @@ public class LikelihoodCalculationEngine {
// System to compute the best N haplotypes for genotyping
//
// --------------------------------------------------------------------------------
/**
* Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
* @param map an annoying map object that moves us between the allele and haplotype representation
* @param haplotypeAsAllele the allele version of the haplotype
* @return the haplotype version, with its score incremented by 1 if its non-reference
*/
/**
 * Helper function for selectBestHaplotypesFromEachSample that updates the score of the
 * haplotype corresponding to haplotypeAsAllele.
 *
 * @param map an annoying map object that moves us between the allele and haplotype representation
 * @param haplotypeAsAllele the allele version of the haplotype
 * @return the haplotype version, with its score incremented by 1 if it is non-reference
 */
private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
    final Haplotype haplotype = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
    if ( haplotype.isNonReference() ) {
        // the reference haplotype's score is already pinned at its max value, so only bump non-ref
        haplotype.setScore(haplotype.getScore() + 1);
    }
    return haplotype;
}
/**
* Take the best N haplotypes and return them as a list
*
* Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
* as its preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
* order of score (so higher score haplotypes are preferred). The N we take is determined by
*
* N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
*
* where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
* bounded by maxNumHaplotypesInPopulation as that number can grow without bound
*
* @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
* @param nSamples the number of samples used to select the haplotypes
* @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
* @return a list of N or fewer haplotypes, with the reference haplotype first
*/
/**
 * Take the best N haplotypes, by score, and return them as a list.
 *
 * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation), so the workload stays bounded
 * by maxNumHaplotypesInPopulation regardless of the number of samples.
 *
 * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
 * @param nSamples the number of samples used to select the haplotypes
 * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
 * @return a list of N or fewer haplotypes, with the reference haplotype first
 */
private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
    // order candidates by decreasing score
    final List<Haplotype> sorted = new ArrayList<>(selectedHaplotypes);
    Collections.sort(sorted, new HaplotypeScoreComparator());
    final int nChromosomesPlusRef = 2 * nSamples + 1;
    final int nToKeep = Math.min(nChromosomesPlusRef, maxNumHaplotypesInPopulation);
    final List<Haplotype> best = sorted.size() <= nToKeep ? sorted : sorted.subList(0, nToKeep);
    // the reference haplotype carries the maximal score, so it must sort first
    if ( best.get(0).isNonReference() )
        throw new IllegalStateException("BUG: reference haplotype should be first in list");
    return best;
}
/**
* Select the best haplotypes for genotyping the samples in stratifiedReadMap
*
* Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
* haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
* all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
* one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
* the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
* haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
*
* @param haplotypes a list of all haplotypes we're considering
* @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
* @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
* @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
*/
public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
// all of the haplotypes that at least one sample called as one of the most likely
final Set<Haplotype> selectedHaplotypes = new HashSet<>();
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
// our annoying map from allele -> haplotype
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
for ( final Haplotype h : haplotypes ) {
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
allele2Haplotype.put(Allele.create(h, h.isReference()), h);
}
// for each sample, compute the most likely pair of haplotypes
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
// get the two most likely haplotypes under a diploid model for this sample
final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
if ( mla != null ) { // there was something to evaluate in this sample
// note that there must be at least 2 haplotypes
final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
// if ( DEBUG ) {
// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
//
// /**
// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
// * @param map an annoying map object that moves us between the allele and haplotype representation
// * @param haplotypeAsAllele the allele version of the haplotype
// * @return the haplotype version, with its score incremented by 1 if its non-reference
// */
// private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
// return h;
// }
//
// /**
// * Take the best N haplotypes and return them as a list
// *
// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
// * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
// * order of score (so higher score haplotypes are preferred). The N we take is determined by
// *
// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
// *
// * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound
// *
// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
// * @param nSamples the number of samples used to select the haplotypes
// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
// * @return a list of N or fewer haplotypes, with the reference haplotype first
// */
// private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
// final List<Haplotype> selectedHaplotypesList = new ArrayList<>(selectedHaplotypes);
// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
// final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
// return bestHaplotypes;
// }
//
// /**
// * Select the best haplotypes for genotyping the samples in stratifiedReadMap
// *
// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
// * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
// *
// * @param haplotypes a list of all haplotypes we're considering
// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
// * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
// */
// public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
//
// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
//
// // all of the haplotypes that at least one sample called as one of the most likely
// final Set<Haplotype> selectedHaplotypes = new HashSet<>();
// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
//
// // our annoying map from allele -> haplotype
// final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
// for ( final Haplotype h : haplotypes ) {
// h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
// allele2Haplotype.put(Allele.create(h, h.isReference()), h);
// }
//
// // for each sample, compute the most likely pair of haplotypes
// for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
// // get the two most likely haplotypes under a diploid model for this sample
// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
//
// if ( mla != null ) { // there was something to evaluate in this sample
// // note that there must be at least 2 haplotypes
// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
//
//// if ( DEBUG ) {
//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
//// }
//
// // add these two haplotypes to the set of haplotypes that have been selected
// selectedHaplotypes.add(best);
// selectedHaplotypes.add(second);
//
// // we've already selected all of our haplotypes, and we don't need to prune them down
// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
// break;
// }
// add these two haplotypes to the set of haplotypes that have been selected
selectedHaplotypes.add(best);
selectedHaplotypes.add(second);
// we've already selected all of our haplotypes, and we don't need to prune them down
if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
break;
}
}
// take the best N haplotypes forward, in order of the number of samples that choose them
final int nSamples = stratifiedReadMap.size();
final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
if ( DEBUG ) {
logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
for ( final Haplotype h : bestHaplotypes ) {
logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
}
}
return bestHaplotypes;
}
/**
* Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
* @param haplotypes non-null list of haplotypes
* @return the reference haplotype
*/
/**
 * Find the haplotype that isReference(), or throw ReviewedStingException if none is found.
 *
 * @param haplotypes non-null list of haplotypes
 * @return the reference haplotype
 */
private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
    for ( final Haplotype candidate : haplotypes ) {
        if ( candidate.isReference() )
            return candidate;
    }
    throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}
// }
//
// // take the best N haplotypes forward, in order of the number of samples that choose them
// final int nSamples = stratifiedReadMap.size();
// final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
//
// if ( DEBUG ) {
// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
// for ( final Haplotype h : bestHaplotypes ) {
// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
// }
// }
// return bestHaplotypes;
// }
//
// /**
// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
// * @param haplotypes non-null list of haplotypes
// * @return the reference haplotype
// */
// private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
// for( final Haplotype h : haplotypes ) {
// if( h.isReference() ) return h;
// }
// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
// }
// --------------------------------------------------------------------------------
//

View File

@ -48,11 +48,11 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
@ -65,6 +65,7 @@ import org.broadinstitute.variant.variantcontext.Allele;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map;
//import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM;
@ -206,6 +207,39 @@ public class PairHMMIndelErrorModel {
}
}
/**
 * Trim every haplotype in the map down to the reference window
 * [startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes), clamped to each
 * haplotype's own start/stop positions.
 *
 * NOTE(review): the window parameters are mutated inside the loop, so clamping performed
 * for one haplotype carries over to all subsequent haplotypes (LinkedHashMap iteration
 * order therefore matters) -- confirm this cross-iteration behavior is intended.
 *
 * @param haplotypeMap map from each allele to its untrimmed haplotype
 * @param startLocationInRefForHaplotypes desired window start in reference coordinates
 * @param stopLocationInRefForHaplotypes desired window stop in reference coordinates
 * @param ref the reference context (used only for debug output)
 * @return a new LinkedHashMap with the same alleles mapped to trimmed haplotypes
 */
private LinkedHashMap<Allele, Haplotype> trimHaplotypes(final LinkedHashMap<Allele, Haplotype> haplotypeMap,
                                                        long startLocationInRefForHaplotypes,
                                                        long stopLocationInRefForHaplotypes,
                                                        final ReferenceContext ref){
    final LinkedHashMap<Allele, Haplotype> trimmedHaplotypeMap = new LinkedHashMap<>();
    for (final Allele a: haplotypeMap.keySet()) {
        final Haplotype haplotype = haplotypeMap.get(a);

        // shrink the window stop so it never extends past this haplotype's end
        if (stopLocationInRefForHaplotypes > haplotype.getStopPosition())
            stopLocationInRefForHaplotypes = haplotype.getStopPosition();

        // clamp the window start into [haplotype start, haplotype stop]
        if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
            startLocationInRefForHaplotypes = haplotype.getStartPosition();
        else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
            startLocationInRefForHaplotypes = haplotype.getStopPosition();

        // convert the reference-coordinate window into offsets within the haplotype's own bases
        final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
        final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
        if (DEBUG)
            System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n",
                    indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);

        // get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map
        final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
        final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference());
        trimmedHaplotypeMap.put(a, trimmedHaplotype);
    }
    return trimmedHaplotypeMap;
}
public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
@ -231,6 +265,8 @@ public class PairHMMIndelErrorModel {
final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final LinkedList<GATKSAMRecord> readList = new LinkedList<>();
final Map<GATKSAMRecord, byte[]> readGCPArrayMap = new LinkedHashMap<>();
int readIdx=0;
for (PileupElement p: pileup) {
// > 1 when the read is a consensus read representing multiple independent observations
@ -369,86 +405,30 @@ public class PairHMMIndelErrorModel {
baseDeletionQualities = contextLogGapOpenProbabilities;
}
byte[] currentHaplotypeBases = null;
boolean firstHap = true;
double readLikelihood;
Allele currentAllele = null;
for (Allele a: haplotypeMap.keySet()) {
// Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM
final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities);
readList.add(processedRead);
Haplotype haplotype = haplotypeMap.get(a);
// Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM
readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities);
if (stopLocationInRefForHaplotypes > haplotype.getStopPosition())
stopLocationInRefForHaplotypes = haplotype.getStopPosition();
// Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations
final Map<Allele, Haplotype> trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref);
if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
startLocationInRefForHaplotypes = haplotype.getStartPosition();
else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
startLocationInRefForHaplotypes = haplotype.getStopPosition();
// Get the likelihoods for our clipped read against each of our trimmed haplotypes.
final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap);
final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
if (DEBUG)
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString());
// peek at the next haplotype in the list
final byte[] nextHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
// process the current haplotype in the list
if (currentHaplotypeBases != null) {
// it's possible that the indel starts at the last base of the haplotypes
if ( currentHaplotypeBases.length == 0 ) {
readLikelihood = -Double.MAX_VALUE;
} else {
if (firstHap) {
//no need to reallocate arrays for each new haplotype, as length won't change
pairHMM.initialize(readBases.length, currentHaplotypeBases.length);
firstHap = false;
}
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals,
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, nextHaplotypeBases);
}
if (DEBUG) {
System.out.println("H:"+new String(currentHaplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
}
perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
}
// update the current haplotype
currentHaplotypeBases = nextHaplotypeBases;
currentAllele = a;
}
// process the final haplotype
if (currentHaplotypeBases != null) {
// it's possible that the indel starts at the last base of the haplotypes
if ( currentHaplotypeBases.length == 0 ) {
readLikelihood = -Double.MAX_VALUE;
} else {
if (firstHap) {
//no need to reallocate arrays for each new haplotype, as length won't change
pairHMM.initialize(readBases.length, currentHaplotypeBases.length);
firstHap = false;
}
// there is no next haplotype, so pass null for nextHaplotypeBases.
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals,
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, null);
}
if (DEBUG) {
System.out.println("H:"+new String(currentHaplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
}
perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood);
// Pack the original pileup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array
for (Allele a: trimmedHaplotypeMap.keySet()){
double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a);
perReadAlleleLikelihoodMap.add(p, a, readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
}
// The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually
readList.remove(processedRead);
// The same is true for the read/GCP-array map
readGCPArrayMap.remove(processedRead);
}
}
readIdx++;
@ -472,16 +452,16 @@ public class PairHMMIndelErrorModel {
return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength));
}
/**
 * Returns the index of the first position at which the two byte arrays differ,
 * or b1.length when they are identical.
 *
 * @param b1 first array
 * @param b2 second array
 * @return first differing index; 0 when the arrays have different lengths (sanity check)
 */
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
    if (b1.length != b2.length) {
        return 0; // sanity check: length mismatch is reported as differing at position 0
    }
    int position = 0;
    while (position < b1.length && b1[position] == b2[position]) {
        position++;
    }
    return position;
}
// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
// if (b1.length != b2.length)
// return 0; // sanity check
//
// for (int i=0; i < b1.length; i++ ){
// if ( b1[i]!= b2[i] )
// return i;
// }
// return b1.length;
// }
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];

View File

@ -49,6 +49,7 @@ package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
/**

View File

@ -47,12 +47,16 @@
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import java.io.File;
import java.lang.reflect.Field;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
private static class HmmInput {
@ -62,14 +66,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
public byte[] deletionGOP;
public byte[] overallGCP;
public List<Haplotype> haplotypes;
};
}
public static class ResultQueue {
private int offset;
private List<double[]> batchResults;
public ResultQueue() {
batchResults = new LinkedList<double[]>();
batchResults = new LinkedList<>();
offset = 0;
}
@ -92,7 +96,7 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
final static String libName = "gmvhdl_gatk_hmm";
private static boolean loaded = false;
private List<HmmInput> batchRequests = new LinkedList<HmmInput>();
private List<HmmInput> batchRequests = new LinkedList<>();
private ResultQueue resultQueue = new ResultQueue();
static public boolean isAvailable() {
@ -184,6 +188,55 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
return results;
}
/**
 * {@inheritDoc}
 *
 * Batch (FPGA) implementation: all read/haplotype pairs are enqueued first, then the
 * per-read results are drained from the device's result queue in the same order.
 */
@Override
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap){
    // initialize the pairHMM if necessary
    // NOTE(review): unlike PairHMM.computeLikelihoods, this never re-initializes when a later
    // batch contains longer reads/haplotypes than the first batch -- confirm the FPGA path is
    // insensitive to the initial max-length hint.
    if (! initialized) {
        int readMaxLength = findMaxReadLength(reads);
        int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
        initialize(readMaxLength, haplotypeMaxLength);
    }

    // Pass the read bases/quals, and the haplotypes as a list into the HMM
    performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap);

    // Get the log10-likelihoods for each read/haplotype and pack into the results map
    final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
    collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap);
    return likelihoodMap;
}
// Drain one batch result per read from the result queue and record the per-allele
// log10 likelihoods in the supplied map. Results arrive in the same allele order
// as alleleHaplotypeMap's iteration order.
private void collectLikelihoodResults(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) {
    for (final GATKSAMRecord read : reads) {
        final double[] batchLikelihoods = batchGetResult();
        int alleleIndex = 0;
        for (final Allele allele : alleleHaplotypeMap.keySet()) {
            likelihoodMap.add(read, allele, batchLikelihoods[alleleIndex]);
            alleleIndex++;
        }
    }
}
// Enqueue one HMM batch request per read: each request pairs the read's bases and
// quality arrays with the full haplotype list and the read's gap-continuation-penalty array.
private void performBatchAdditions(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, Map<GATKSAMRecord, byte[]> GCPArrayMap) {
    final List<Haplotype> haplotypes = getHaplotypeList(alleleHaplotypeMap);
    for (final GATKSAMRecord read : reads) {
        batchAdd(haplotypes,
                 read.getReadBases(),
                 read.getBaseQualities(),
                 read.getBaseInsertionQualities(),
                 read.getBaseDeletionQualities(),
                 GCPArrayMap.get(read));
    }
}
protected double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
@ -196,6 +249,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
return 0.0;
}
/**
 * Collects the haplotypes of the given allele->haplotype map into a list,
 * preserving the map's iteration order.
 *
 * @param alleleHaplotypeMap map from alleles to their haplotypes
 * @return a new list containing the map's haplotype values, in iteration order
 */
private List<Haplotype> getHaplotypeList(Map<Allele, Haplotype> alleleHaplotypeMap){
    // Copy values() directly instead of iterating keySet() + get(): same order, one lookup fewer per entry.
    return new LinkedList<>(alleleHaplotypeMap.values());
}
private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) {
double[] results = null;
int n = dequeueRequirement(haplotypeBases.length, readBases.length);

View File

@ -70,17 +70,6 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
private static final int deletionToDeletion = 5;
/**
* {@inheritDoc}
*/
@Override
public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(readMaxLength, haplotypeMaxLength);
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**
* {@inheritDoc}
*/

View File

@ -110,7 +110,7 @@ public class PerReadAlleleLikelihoodMap {
* @return a map from each allele to a list of reads that 'support' the allele
*/
protected Map<Allele,List<GATKSAMRecord>> getAlleleStratifiedReadMap() {
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>(alleles.size());
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<>(alleles.size());
for ( final Allele allele : alleles )
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
@ -152,7 +152,7 @@ public class PerReadAlleleLikelihoodMap {
/**
* Does the current map contain the key associated with a particular SAM record in pileup?
* @param p Pileup element
* @return
* @return true if the map contains pileup element, else false
*/
public boolean containsPileupElement(final PileupElement p) {
return likelihoodReadMap.containsKey(p.getRead());
@ -176,9 +176,9 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.keySet();
}
public Collection<Map<Allele,Double>> getLikelihoodMapValues() {
return likelihoodReadMap.values();
}
// public Collection<Map<Allele,Double>> getLikelihoodMapValues() {
// return likelihoodReadMap.values();
// }
public int getNumberOfStoredElements() {
return likelihoodReadMap.size();
@ -191,6 +191,21 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.get(p.getRead());
}
/**
 * Get the log10 likelihood associated with an individual read/allele pair.
 *
 * @param read the read whose likelihood we want
 * @param allele the allele whose likelihood we want
 * @return the log10 likelihood that this read matches this allele, or 0.0 when the
 *         allele or read is unknown to this map.
 *         NOTE(review): 0.0 is log10(1.0), i.e. a "perfect" likelihood -- callers must not
 *         confuse the absent-entry sentinel with a genuine likelihood; confirm intent.
 */
public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){
    if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read))
        return 0.0;  // sentinel for "not tracked here" -- see NOTE above
    return likelihoodReadMap.get(read).get(allele);
}
/**
* Get the most likely alleles estimated across all reads in this object
*

View File

@ -85,9 +85,6 @@ public final class Log10PairHMM extends N2MemoryPairHMM {
Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY);
}
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**

View File

@ -26,10 +26,6 @@
package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Arrays;
/**
* Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix
@ -58,6 +54,9 @@ abstract class N2MemoryPairHMM extends PairHMM {
matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**

View File

@ -28,9 +28,14 @@ package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
*
@ -59,7 +64,7 @@ public abstract class PairHMM {
protected int maxHaplotypeLength, maxReadLength;
protected int paddedMaxReadLength, paddedMaxHaplotypeLength;
protected int paddedReadLength, paddedHaplotypeLength;
private boolean initialized = false;
protected boolean initialized = false;
// only used for debugging purposes
protected boolean doNotUseTristateCorrection = false;
@ -73,7 +78,7 @@ public abstract class PairHMM {
* @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
* @param readMaxLength the max length of reads we want to use with this PairHMM
*/
public void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
protected void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength);
if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength);
@ -90,6 +95,79 @@ public abstract class PairHMM {
initialized = true;
}
/**
 * Returns the length of the longest read in the list, or 0 for an empty list.
 *
 * @param reads the reads to scan
 * @return the maximum read length seen
 */
protected int findMaxReadLength(final List<GATKSAMRecord> reads) {
    int longest = 0;
    for (final GATKSAMRecord read : reads) {
        longest = Math.max(longest, read.getReadLength());
    }
    return longest;
}
/**
 * Returns the length (in bases) of the longest haplotype in the map, or 0 for an empty map.
 *
 * @param haplotypeMap map from alleles to their haplotypes
 * @return the maximum haplotype length seen
 */
protected int findMaxHaplotypeLength(final Map<Allele, Haplotype> haplotypeMap) {
    // Iterate values() directly instead of keySet() + get(): same order, one lookup fewer per entry.
    int longest = 0;
    for (final Haplotype haplotype : haplotypeMap.values()) {
        longest = Math.max(longest, haplotype.getBases().length);
    }
    return longest;
}
/**
 * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from
 * each haplotype given base substitution, insertion, and deletion probabilities.
 *
 * @param reads the list of reads
 * @param alleleHaplotypeMap map from each allele to its haplotype
 * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read.
 * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of
 *         said read coming from the said haplotype under the provided error model
 */
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap) {
    // (re)initialize the pairHMM only if necessary: first use, or inputs larger than the current matrices
    final int readMaxLength = findMaxReadLength(reads);
    final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
    if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); }

    final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
    for (final GATKSAMRecord read : reads) {
        final byte[] readBases = read.getReadBases();
        final byte[] readQuals = read.getBaseQualities();
        final byte[] readInsQuals = read.getBaseInsertionQualities();
        final byte[] readDelQuals = read.getBaseDeletionQualities();
        final byte[] overallGCP = GCPArrayMap.get(read);

        // Each haplotype's likelihood is computed one iteration late so we can peek at the NEXT
        // haplotype's bases (nextHaplotypeBases is required for caching in the array implementation).
        byte[] currentHaplotypeBases = null;
        boolean isFirstHaplotype = true;
        Allele currentAllele = null;
        double log10l;
        for (final Allele allele : alleleHaplotypeMap.keySet()) {
            final Haplotype haplotype = alleleHaplotypeMap.get(allele);
            final byte[] nextHaplotypeBases = haplotype.getBases();
            if (currentHaplotypeBases != null) {
                log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
                        readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
                likelihoodMap.add(read, currentAllele, log10l);
                // BUGFIX: clear the recache flag after the first computation for this read; previously it
                // stayed true for every haplotype, forcing recacheReadValues on each call and defeating
                // the read-value caching the flag exists to enable.
                isFirstHaplotype = false;
            }
            // update the current haplotype
            currentHaplotypeBases = nextHaplotypeBases;
            currentAllele = allele;
        }

        // process the final haplotype; there is no next haplotype, so pass null for nextHaplotypeBases.
        if (currentHaplotypeBases != null) {
            log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
                    readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null);
            likelihoodMap.add(read, currentAllele, log10l);
        }
    }
    return likelihoodMap;
}
/**
* Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion
* probabilities.
@ -110,7 +188,7 @@ public abstract class PairHMM {
* parameters are the same, and only the haplotype bases are changing underneath us
* @return the log10 probability of read coming from the haplotype under the provided error model
*/
public final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
@ -118,6 +196,7 @@ public abstract class PairHMM {
final byte[] overallGCP,
final boolean recacheReadValues,
final byte[] nextHaploytpeBases) {
if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10");
if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null");
if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength);

View File

@ -139,7 +139,7 @@ public class GATKSAMRecord extends BAMRecord {
}
public static GATKSAMRecord createRandomRead(int length) {
List<CigarElement> cigarElements = new LinkedList<CigarElement>();
List<CigarElement> cigarElements = new LinkedList<>();
cigarElements.add(new CigarElement(length, CigarOperator.M));
Cigar cigar = new Cigar(cigarElements);
return ArtificialSAMUtils.createArtificialRead(cigar);
@ -536,10 +536,7 @@ public class GATKSAMRecord extends BAMRecord {
* @return True if an attribute has been set for this key.
*/
public boolean containsTemporaryAttribute(Object key) {
if(temporaryAttributes != null) {
return temporaryAttributes.containsKey(key);
}
return false;
return temporaryAttributes != null && temporaryAttributes.containsKey(key);
}
/**
@ -556,7 +553,7 @@ public class GATKSAMRecord extends BAMRecord {
*/
public Object setTemporaryAttribute(Object key, Object value) {
if(temporaryAttributes == null) {
temporaryAttributes = new HashMap<Object, Object>();
temporaryAttributes = new HashMap<>();
}
return temporaryAttributes.put(key, value);
}
@ -750,6 +747,46 @@ public class GATKSAMRecord extends BAMRecord {
return emptyRead;
}
/**
 * Creates a new GATKSAMRecord with the source read's header, read group and mate
 * information, but with the following fields set to user-supplied values:
 *  - Read Bases
 *  - Base Qualities
 *  - Base Insertion Qualities
 *  - Base Deletion Qualities
 *
 * Cigar string is empty (not-null)
 *
 * Use this method if you want to create a new GATKSAMRecord based on
 * another GATKSAMRecord, but with modified bases and qualities
 *
 * @param read a read to copy the header from
 * @param readBases an array containing the new bases you wish to use in place of the originals
 * @param baseQualities an array containing the new base qualities you wish to use in place of the originals
 * @param baseInsertionQualities an array containing the new base insertion qualities
 * @param baseDeletionQualities an array containing the new base deletion qualities
 * @return a read with modified bases and qualities, safe for the GATK
 * @throws IllegalArgumentException if any quality array's length differs from readBases.length
 */
public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read,
                                                      final byte[] readBases,
                                                      final byte[] baseQualities,
                                                      final byte[] baseInsertionQualities,
                                                      final byte[] baseDeletionQualities) {
    // all three quality arrays must be parallel to the read bases
    if ( baseQualities.length != readBases.length || baseInsertionQualities.length != readBases.length || baseDeletionQualities.length != readBases.length )
        throw new IllegalArgumentException("Read bases and read quality arrays aren't the same size: Bases:" + readBases.length
                + " vs Base Q's:" + baseQualities.length
                + " vs Insert Q's:" + baseInsertionQualities.length
                + " vs Delete Q's:" + baseDeletionQualities.length);

    // start from an empty clone of the source read, then install the supplied bases and quality arrays
    final GATKSAMRecord processedRead = GATKSAMRecord.emptyRead(read);
    processedRead.setReadBases(readBases);
    processedRead.setBaseQualities(baseQualities, EventType.BASE_SUBSTITUTION);
    processedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION);
    processedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION);
    return processedRead;
}
/**
* Shallow copy of everything, except for the attribute list and the temporary attributes.
* A new list of the attributes is created for both, but the attributes themselves are copied by reference.
@ -762,7 +799,7 @@ public class GATKSAMRecord extends BAMRecord {
public Object clone() throws CloneNotSupportedException {
final GATKSAMRecord clone = (GATKSAMRecord) super.clone();
if (temporaryAttributes != null) {
clone.temporaryAttributes = new HashMap<Object, Object>();
clone.temporaryAttributes = new HashMap<>();
for (Object attribute : temporaryAttributes.keySet())
clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute));
}