Refactoring the PairHMM util class to allow for multiple implementations which can be specified by the callers via an enum argument. Adding an optimized PairHMM implementation which caches per-read calculations as well as a logless implementation which drastically reduces the runtime of the HMM while also increasing the precision of the result. In the HaplotypeCaller we now lexicographically sort the haplotypes to take maximal benefit of the haplotype offset optimization which only recalculates the HMM matrices after the first differing base in the haplotype. Many thanks to Mauricio for all the initial groundwork for these optimizations. The change to the one HC integration test is in the fourth decimal of HaplotypeScore.

This commit is contained in:
Ryan Poplin 2012-10-20 16:38:18 -04:00
parent 25e7dcc46f
commit a647f1e076
16 changed files with 813 additions and 457 deletions

View File

@ -72,7 +72,7 @@ public class ErrorModel {
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
if (refSampleVC.isIndel()) {
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
}
}

View File

@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
}

View File

@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -114,6 +115,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
protected PrintStream graphWriter = null;
/**
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
*/
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
@Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
protected String keepRG = null;
@ -287,7 +294,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
}
@ -400,6 +407,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
// evaluate each sample's reads against all haplotypes
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );

View File

@ -30,6 +30,9 @@ import com.google.java.contract.Requires;
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pairhmm.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -44,8 +47,25 @@ public class LikelihoodCalculationEngine {
private final boolean DEBUG;
private final PairHMM pairHMM;
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) {
pairHMM = new PairHMM( noBanded );
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
switch (hmmType) {
case EXACT:
pairHMM = new ExactPairHMM();
break;
case ORIGINAL:
pairHMM = new OriginalPairHMM();
break;
case CACHING:
pairHMM = new CachingPairHMM();
break;
case LOGLESS_CACHING:
pairHMM = new LoglessCachingPairHMM();
break;
default:
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
}
this.constantGCP = constantGCP;
DEBUG = debug;
}
@ -69,23 +89,18 @@ public class LikelihoodCalculationEngine {
X_METRIC_LENGTH += 2;
Y_METRIC_LENGTH += 2;
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
// for each sample's reads
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
// evaluate the likelihood of the reads given those haplotypes
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray );
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() );
}
}
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
final int numHaplotypes = haplotypes.size();
final int numReads = reads.size();
@ -113,9 +128,8 @@ public class LikelihoodCalculationEngine {
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
previousHaplotypeSeen = haplotype;
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP,
haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0);
readCounts[jjj][iii] = readCount;
}
}
@ -130,7 +144,7 @@ public class LikelihoodCalculationEngine {
return iii;
}
}
return b1.length;
return Math.min(b1.length, b2.length);
}
@Requires({"haplotypes.size() > 0"})
@ -280,7 +294,7 @@ public class LikelihoodCalculationEngine {
final int numHaplotypes = haplotypes.size();
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
// set up the default 1-to-1 haplotype mapping object
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
for( final Haplotype h : haplotypes ) {
@ -322,6 +336,13 @@ public class LikelihoodCalculationEngine {
return bestHaplotypes;
}
public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
for( final Haplotype h : haplotypes ) {
if( h.isReference() ) { return haplotypes.indexOf(h); }
}
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,

View File

@ -0,0 +1,181 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
/**
* Created with IntelliJ IDEA.
* User: rpoplin, carneiro
* Date: 10/16/12
*/
public class CachingPairHMM extends OriginalPairHMM {
double[][] constantMatrix = null; // The cache in the CachingPairHMM
double[][] distanceMatrix = null; // The cache in the CachingPairHMM
protected static final double [] firstRowConstantMatrix = {
QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)),
QualityUtils.qualToProbLog10(DEFAULT_GCP),
QualityUtils.qualToErrorProbLog10(DEFAULT_GOP),
QualityUtils.qualToErrorProbLog10(DEFAULT_GCP),
0.0,
0.0
};
@Override
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH);
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
constantMatrix = new double[X_METRIC_LENGTH][6];
distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
// fill in the first row
for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
}
}
@Override
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues ) {
if( recacheReadValues ) {
initializeConstants( insertionGOP, deletionGOP, overallGCP );
}
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
for (int i = 2; i < X_METRIC_LENGTH; i++) {
for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
}
/**
* Initializes the matrix that holds all the constants related to the editing
* distance between the read and the haplotype.
*
* @param haplotypeBases the bases of the haplotype
* @param readBases the bases of the read
* @param readQuals the base quality scores of the read
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
*/
public void initializeDistanceMatrix( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final int startIndex ) {
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
for (int i = 0; i < readBases.length; i++) {
final byte x = readBases[i];
final byte qual = readQuals[i];
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
}
}
/**
* Initializes the matrix that holds all the constants related to quality scores.
*
* @param insertionGOP insertion quality scores of the read
* @param deletionGOP deletion quality scores of the read
* @param overallGCP overall gap continuation penalty
*/
public void initializeConstants( final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP ) {
final int l = insertionGOP.length;
constantMatrix[1] = firstRowConstantMatrix;
for (int i = 0; i < l; i++) {
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP);
constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]);
constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]);
constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]);
constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
}
constantMatrix[l+1][4] = 0.0;
constantMatrix[l+1][5] = 0.0;
}
/**
* Updates a cell in the HMM matrix
*
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
* initial conditions
* @param indI row index in the matrices to update
* @param indJ column index in the matrices to update
* @param prior the likelihood editing distance matrix for the read x haplotype
* @param constants an array with the six constants relevant to this location
* @param matchMetricArray the matches likelihood matrix
* @param XMetricArray the insertions likelihood matrix
* @param YMetricArray the deletions likelihood matrix
*/
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
matchMetricArray[indI][indJ] = prior +
MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0],
XMetricArray[indI - 1][indJ - 1] + constants[1],
YMetricArray[indI - 1][indJ - 1] + constants[1] );
XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2],
XMetricArray[indI - 1][indJ] + constants[3]);
YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4],
YMetricArray[indI][indJ - 1] + constants[5]);
}
}

View File

@ -0,0 +1,187 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
/**
* Created with IntelliJ IDEA.
* User: rpoplin, carneiro
* Date: 10/16/12
*/
public class LoglessCachingPairHMM extends CachingPairHMM {
protected static final double SCALE_FACTOR_LOG10 = 300.0;
protected static final double [] firstRowConstantMatrix = {
QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)),
QualityUtils.qualToProb(DEFAULT_GCP),
QualityUtils.qualToErrorProb(DEFAULT_GOP),
QualityUtils.qualToErrorProb(DEFAULT_GCP),
1.0,
1.0
};
@Override
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
Arrays.fill(matchMetricArray[iii], 0.0);
Arrays.fill(XMetricArray[iii], 0.0);
Arrays.fill(YMetricArray[iii], 0.0);
}
// the initial condition
matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // Math.log10(1.0);
constantMatrix = new double[X_METRIC_LENGTH][6];
distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
// fill in the first row
for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
}
}
@Override
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues ) {
if( recacheReadValues ) {
initializeConstants( insertionGOP, deletionGOP, overallGCP );
}
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
for (int i = 2; i < X_METRIC_LENGTH; i++) {
for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10;
}
/**
* Initializes the matrix that holds all the constants related to the editing
* distance between the read and the haplotype.
*
* @param haplotypeBases the bases of the haplotype
* @param readBases the bases of the read
* @param readQuals the base quality scores of the read
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
*/
public void initializeDistanceMatrix( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final int startIndex ) {
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
for (int i = 0; i < readBases.length; i++) {
final byte x = readBases[i];
final byte qual = readQuals[i];
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
}
}
}
/**
* Initializes the matrix that holds all the constants related to quality scores.
*
* @param insertionGOP insertion quality scores of the read
* @param deletionGOP deletion quality scores of the read
* @param overallGCP overall gap continuation penalty
*/
public void initializeConstants( final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP ) {
final int l = insertionGOP.length;
constantMatrix[1] = firstRowConstantMatrix;
for (int i = 0; i < l; i++) {
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
}
constantMatrix[l+1][4] = 1.0;
constantMatrix[l+1][5] = 1.0;
}
/**
* Updates a cell in the HMM matrix
*
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
* initial conditions
* @param indI row index in the matrices to update
* @param indJ column index in the matrices to update
* @param prior the likelihood editing distance matrix for the read x haplotype
* @param constants an array with the six constants relevant to this location
* @param matchMetricArray the matches likelihood matrix
* @param XMetricArray the insertions likelihood matrix
* @param YMetricArray the deletions likelihood matrix
*/
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] +
XMetricArray[indI - 1][indJ - 1] * constants[1] +
YMetricArray[indI - 1][indJ - 1] * constants[1] );
XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3];
YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5];
}
}

View File

@ -70,7 +70,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c54c0c9411054bf629bfd98b616e53fc"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}

View File

@ -23,24 +23,26 @@
*/
// our package
package org.broadinstitute.sting.utils;
package org.broadinstitute.sting.utils.pairhmm;
// the imports for unit testing.
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
public class PairHMMUnitTest extends BaseTest {
final static boolean EXTENSIVE_TESTING = true;
PairHMM hmm = new PairHMM( false ); // reference implementation
PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding
PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation
PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation
PairHMM cachingHMM = new CachingPairHMM();
PairHMM loglessHMM = new LoglessCachingPairHMM();
// --------------------------------------------------------------------------------
//
@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest {
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC";
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA";
public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) {
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
}
@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest {
}
public double expectedLogL() {
return expectedQual / -10.0;
return (expectedQual / -10.0) + 0.03 ;
}
public double tolerance() {
return 0.1; // TODO FIXME arbitrary
public double toleranceFromTheoretical() {
return 0.2;
}
public double calcLogL() {
public double toleranceFromReference() {
return 1E-4;
}
double logL = hmm.computeReadLikelihoodGivenHaplotype(
public double toleranceFromExact() {
return 1E-9;
}
public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) {
pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length);
return pairHMM.computeReadLikelihoodGivenHaplotypeLog10(
refBasesWithContext, readBasesWithContext,
qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true),
qualAsBytes(gcp, false));
return logL;
qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel),
qualAsBytes(gcp, false, anchorIndel), 0, true);
}
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
}
private byte[] qualAsBytes(final int phredQual, final boolean doGOP) {
private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) {
final byte phredQuals[] = new byte[readBasesWithContext.length];
// initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
Arrays.fill(phredQuals, (byte)100);
// update just the bases corresponding to the provided micro read with the quality scores
if( doGOP ) {
phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
} else {
for ( int i = 0; i < read.length(); i++)
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
}
if( anchorIndel ) {
// initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
Arrays.fill(phredQuals, (byte)100);
return phredQuals;
}
}
final Random random = new Random(87865573);
private class BandedLikelihoodTestProvider extends TestDataProvider {
final String ref, read;
final byte[] refBasesWithContext, readBasesWithContext;
final int baseQual, insQual, delQual, gcp;
final int expectedQual;
final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC";
final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA";
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT";
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA";
final byte[] baseQuals, insQuals, delQuals, gcps;
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
}
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) {
super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual));
this.baseQual = baseQual;
this.delQual = delQual;
this.insQual = insQual;
this.gcp = gcp;
this.read = read;
this.ref = ref;
this.expectedQual = expectedQual;
refBasesWithContext = asBytes(ref, left, right);
readBasesWithContext = asBytes(read, false, false);
baseQuals = qualAsBytes(baseQual);
insQuals = qualAsBytes(insQual);
delQuals = qualAsBytes(delQual);
gcps = qualAsBytes(gcp, false);
}
public double expectedLogL() {
double logL = hmm.computeReadLikelihoodGivenHaplotype(
refBasesWithContext, readBasesWithContext,
baseQuals, insQuals, delQuals, gcps);
return logL;
}
public double tolerance() {
return 0.2; // TODO FIXME arbitrary
}
public double calcLogL() {
double logL = bandedHMM.computeReadLikelihoodGivenHaplotype(
refBasesWithContext, readBasesWithContext,
baseQuals, insQuals, delQuals, gcps);
return logL;
}
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
}
private byte[] qualAsBytes(final int phredQual) {
return qualAsBytes(phredQual, true);
}
private byte[] qualAsBytes(final int phredQual, final boolean addRandom) {
final byte phredQuals[] = new byte[readBasesWithContext.length];
Arrays.fill(phredQuals, (byte)phredQual);
if(addRandom) {
for( int iii = 0; iii < phredQuals.length; iii++) {
phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3));
// update just the bases corresponding to the provided micro read with the quality scores
if( doGOP ) {
phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
} else {
for ( int i = 0; i < read.length(); i++)
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
}
} else {
Arrays.fill(phredQuals, (byte)phredQual);
}
return phredQuals;
}
}
@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest {
// test all combinations
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30);
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2);
for ( final int baseQual : baseQuals ) {
for ( final int indelQual : indelQuals ) {
@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest {
for ( boolean insertionP : Arrays.asList(true, false)) {
final String small = Utils.dupString((char)base, 1);
final String big = Utils.dupString((char)base, size);
final String big = Utils.dupString((char) base, size);
final String ref = insertionP ? small : big;
final String read = insertionP ? big : small;
@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest {
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
}
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
double calculatedLogL = cfg.calcLogL();
double expectedLogL = cfg.expectedLogL();
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
}
@DataProvider(name = "BandedLikelihoodTestProvider")
public Object[][] makeBandedLikelihoodTests() {
final Random random = new Random(87860573);
@DataProvider(name = "OptimizedLikelihoodTestProvider")
public Object[][] makeOptimizedLikelihoodTests() {
// context on either side is ACGTTGCA REF ACGTTGCA
// test all combinations
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30);
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30);
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2);
for ( final int baseQual : baseQuals ) {
for ( final int indelQual : indelQuals ) {
for ( final int gcp : gcps ) {
// test substitutions
for ( final byte refBase : BaseUtils.BASES ) {
for ( final byte readBase : BaseUtils.BASES ) {
final String ref = new String(new byte[]{refBase});
final String read = new String(new byte[]{readBase});
final int expected = refBase == readBase ? 0 : baseQual;
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
}
}
// test insertions and deletions
for ( final int size : sizes ) {
for ( final byte base : BaseUtils.BASES ) {
final int expected = indelQual + (size - 2) * gcp;
for ( boolean insertionP : Arrays.asList(true, false)) {
final String small = Utils.dupString((char)base, 1);
final String big = Utils.dupString((char)base, size);
final String ref = insertionP ? small : big;
final String read = insertionP ? big : small;
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false);
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true);
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true);
for ( final int refSize : sizes ) {
for ( final int readSize : sizes ) {
String ref = "";
String read = "";
for( int iii = 0; iii < refSize; iii++) {
ref += (char) BaseUtils.BASES[random.nextInt(4)];
}
for( int iii = 0; iii < readSize; iii++) {
read += (char) BaseUtils.BASES[random.nextInt(4)];
}
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp);
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false);
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true);
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true);
}
}
}
}
}
return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class);
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
}
@Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true)
public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) {
double calculatedLogL = cfg.calcLogL();
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
double exactLogL = cfg.calcLogL( exactHMM, true );
double calculatedLogL = cfg.calcLogL( originalHMM, true );
double optimizedLogL = cfg.calcLogL( cachingHMM, true );
double loglessLogL = cfg.calcLogL( loglessHMM, true );
double expectedLogL = cfg.expectedLogL();
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical());
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical());
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
}
@Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true)
public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) {
double exactLogL = cfg.calcLogL( exactHMM, false );
double calculatedLogL = cfg.calcLogL( originalHMM, false );
double optimizedLogL = cfg.calcLogL( cachingHMM, false );
double loglessLogL = cfg.calcLogL( loglessHMM, false );
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
}
@Test
@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest {
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
// change single base at position k to C. If it's a C, change to T
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
double res1 = hmm.computeReadLikelihoodGivenHaplotype(
originalHMM.initialize(mread.length, haplotype1.length);
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
haplotype1, mread,
quals, gop, gop,
gcp);
gcp, 0, false);
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest {
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
// change single base at position k to C. If it's a C, change to T
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
double res1 = hmm.computeReadLikelihoodGivenHaplotype(
originalHMM.initialize(mread.length, haplotype1.length);
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
haplotype1, mread,
quals, gop, gop,
gcp);
gcp, 0, false);
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);

View File

@ -57,7 +57,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@ -65,6 +66,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false)
public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false;
/**
* The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
*/
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false)
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL;
/**
* The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base
* is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value.
@ -112,10 +119,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
public int INDEL_HAPLOTYPE_SIZE = 80;
@Hidden
@Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false)
public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false;
@Hidden
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
@ -221,10 +224,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
uac.exactCallsLog = exactCallsLog;
uac.pairHMM = pairHMM;
// todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION;
return uac;
}

View File

@ -30,8 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.PairHMM;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM;
import org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -48,7 +51,6 @@ public class PairHMMIndelErrorModel {
public static final int BASE_QUAL_THRESHOLD = 20;
private boolean DEBUG = false;
private boolean bandedLikelihoods = false;
private static final int MAX_CACHED_QUAL = 127;
@ -67,6 +69,8 @@ public class PairHMMIndelErrorModel {
private final byte[] GAP_OPEN_PROB_TABLE;
private final byte[] GAP_CONT_PROB_TABLE;
private final PairHMM pairHMM;
/////////////////////////////
// Private Member Variables
/////////////////////////////
@ -85,15 +89,26 @@ public class PairHMMIndelErrorModel {
}
}
public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) {
public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
this.DEBUG = deb;
this.bandedLikelihoods = bandedLikelihoods;
switch (hmmType) {
case EXACT:
pairHMM = new ExactPairHMM();
break;
case ORIGINAL:
pairHMM = new OriginalPairHMM();
break;
case CACHING:
case LOGLESS_CACHING:
default:
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL and EXACT.");
}
// fill gap penalty table, affine naive model:
this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX];
this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX];
for (int i = 0; i < START_HRUN_GAP_IDX; i++) {
GAP_OPEN_PROB_TABLE[i] = indelGOP;
GAP_CONT_PROB_TABLE[i] = indelGCP;
@ -190,7 +205,6 @@ public class PairHMMIndelErrorModel {
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
int readIdx=0;
for (PileupElement p: pileup) {
@ -303,8 +317,6 @@ public class PairHMMIndelErrorModel {
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
int j=0;
// initialize path metric and traceback memories for likelihood computation
double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null;
byte[] previousHaplotypeSeen = null;
final byte[] contextLogGapOpenProbabilities = new byte[readBases.length];
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
@ -341,14 +353,9 @@ public class PairHMMIndelErrorModel {
final int X_METRIC_LENGTH = readBases.length+2;
final int Y_METRIC_LENGTH = haplotypeBases.length+2;
if (matchMetricArray == null) {
if (previousHaplotypeSeen == null) {
//no need to reallocate arrays for each new haplotype, as length won't change
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
}
int startIndexInHaplotype = 0;
@ -356,11 +363,10 @@ public class PairHMMIndelErrorModel {
startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
previousHaplotypeSeen = haplotypeBases.clone();
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals,
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
(read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities),
(read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities),
contextLogGapContinuationProbabilities,
startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray);
contextLogGapContinuationProbabilities, startIndexInHaplotype, false);
if (DEBUG) {

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.Serializable;
import java.util.*;
public class Haplotype {
@ -184,6 +185,21 @@ public class Haplotype {
return new Haplotype(newHaplotypeBases);
}
public static class HaplotypeBaseComparator implements Comparator<Haplotype>, Serializable {
@Override
public int compare( final Haplotype hap1, final Haplotype hap2 ) {
final byte[] arr1 = hap1.getBases();
final byte[] arr2 = hap2.getBases();
// compares byte arrays using lexical ordering
final int len = Math.min(arr1.length, arr2.length);
for( int iii = 0; iii < len; iii++ ) {
final int cmp = arr1[iii] - arr2[iii];
if (cmp != 0) { return cmp; }
}
return arr2.length - arr1.length;
}
}
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(final List<Allele> alleleList,
final int startPos,
final ReferenceContext ref,

View File

@ -1,259 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import java.util.*;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
* User: rpoplin
* Date: 3/1/12
*/
public class PairHMM {
private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
private static final byte DEFAULT_GOP = (byte) 45;
private static final byte DEFAULT_GCP = (byte) 10;
private static final double BANDING_TOLERANCE = 22.0;
private static final int BANDING_CLUSTER_WINDOW = 12;
private final boolean noBanded;
public PairHMM() {
noBanded = false;
}
public PairHMM( final boolean noBanded ) {
this.noBanded = noBanded;
}
public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray,
final int X_METRIC_LENGTH) {
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
}
// the initial condition
matchMetricArray[1][1] = 0.0; // Math.log10(1.0);
}
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray);
}
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// ensure that all the qual scores have valid values
for( int iii = 0; iii < readQuals.length; iii++ ) {
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
}
if( false ) {
final ArrayList<Integer> workQueue = new ArrayList<Integer>(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step
final ArrayList<Integer> workToBeAdded = new ArrayList<Integer>();
final ArrayList<Double> calculatedValues = new ArrayList<Double>();
final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1;
workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype
for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over
//Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order
int el = 1;
for( int work : workQueue ) {
// choose the appropriate diagonal baseline location
int iii = 0;
int jjj = diag;
if( diag > Y_METRIC_LENGTH ) {
iii = diag - Y_METRIC_LENGTH;
jjj = Y_METRIC_LENGTH;
}
// move to the starting work location along the diagonal
iii += work;
jjj -= work;
while( iii >= X_METRIC_LENGTH || jjj <= 0 ) {
iii--;
jjj++;
work--;
}
if( !detectClusteredStartLocations(workToBeAdded, work ) ) {
workToBeAdded.add(work); // keep this thread going once it has started
}
if( work >= el - 3 ) {
// step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value
double maxElement = Double.NEGATIVE_INFINITY;
for( el = work; el < numDiags + 1; el++ ) {
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
calculatedValues.add(bestMetric);
if( bestMetric > maxElement ) {
maxElement = bestMetric;
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
break;
}
if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix
break;
}
if( --jjj <= 0 ) { // don't walk off the edge of the matrix
break;
}
}
// find a local maximum to start a new band in the work queue
double localMaxElement = Double.NEGATIVE_INFINITY;
int localMaxElementIndex = 0;
for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) {
final double bestMetric = calculatedValues.get(kkk);
if( bestMetric > localMaxElement ) {
localMaxElement = bestMetric;
localMaxElementIndex = kkk;
} else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum
if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) {
workToBeAdded.add( work + localMaxElementIndex );
}
break;
}
}
calculatedValues.clear();
// reset iii and jjj to the appropriate diagonal baseline location
iii = 0;
jjj = diag;
if( diag > Y_METRIC_LENGTH ) {
iii = diag - Y_METRIC_LENGTH;
jjj = Y_METRIC_LENGTH;
}
// move to the starting work location along the diagonal
iii += work-1;
jjj -= work-1;
// step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value
for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) {
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
if( bestMetric > maxElement ) {
maxElement = bestMetric;
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
break;
}
}
}
}
workQueue.clear();
workQueue.addAll(workToBeAdded);
workToBeAdded.clear();
}
} else {
// simple rectangular version of update loop, slow
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMetricArray, XMetricArray, YMetricArray);
}
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
}
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1;
final int jm1 = indJ - 1;
// update the match array
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
final byte x = readBases[im1-1];
final byte y = haplotypeBases[jm1-1];
final byte qual = readQuals[im1-1];
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0);
// update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1);
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2);
}
// private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other
private boolean detectClusteredStartLocations( final ArrayList<Integer> list, int loc ) {
for(int x : list) {
if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) {
return true;
}
}
return false;
}
}

View File

@ -0,0 +1,107 @@
package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 10/16/12
*/
public class ExactPairHMM extends PairHMM {
@Override
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
}
// the initial condition
matchMetricArray[1][1] = 0.0; // Math.log10(1.0);
}
@Override
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// ensure that all the qual scores have valid values
for( int iii = 0; iii < readQuals.length; iii++ ) {
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
}
// simple rectangular version of update loop, slow
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMetricArray, XMetricArray, YMetricArray);
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]});
}
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1;
final int jm1 = indJ - 1;
// update the match array
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
final byte x = readBases[im1-1];
final byte y = haplotypeBases[jm1-1];
final byte qual = readQuals[im1-1];
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0});
// update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1});
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2});
}
}

View File

@ -0,0 +1,105 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
* User: rpoplin
* Date: 3/1/12
*/
public class OriginalPairHMM extends ExactPairHMM {
@Override
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// ensure that all the qual scores have valid values
for( int iii = 0; iii < readQuals.length; iii++ ) {
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
}
// simple rectangular version of update loop, slow
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMetricArray, XMetricArray, YMetricArray);
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
}
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1;
final int jm1 = indJ - 1;
// update the match array
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
final byte x = readBases[im1-1];
final byte y = haplotypeBases[jm1-1];
final byte qual = readQuals[im1-1];
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0);
// update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1);
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2);
}
}

View File

@ -0,0 +1,45 @@
package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 10/16/12
*/
public abstract class PairHMM {
protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
protected static final byte DEFAULT_GOP = (byte) 45;
protected static final byte DEFAULT_GCP = (byte) 10;
public enum HMM_IMPLEMENTATION {
/* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */
EXACT,
/* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */
ORIGINAL,
/* Optimized version of the PairHMM which caches per-read computations */
CACHING,
/* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */
LOGLESS_CACHING
}
protected double[][] matchMetricArray = null;
protected double[][] XMetricArray = null;
protected double[][] YMetricArray = null;
public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH );
@Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length",
"readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"})
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood
public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues );
}