From a647f1e076a37d2fbfbbc7650a2111bcff74a367 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sat, 20 Oct 2012 16:38:18 -0400 Subject: [PATCH 2/6] Refactoring the PairHMM util class to allow for multiple implementations which can be specified by the callers via an enum argument. Adding an optimized PairHMM implementation which caches per-read calculations as well as a logless implementation which drastically reduces the runtime of the HMM while also increasing the precision of the result. In the HaplotypeCaller we now lexicographically sort the haplotypes to take maximal benefit of the haplotype offset optimization which only recalculates the HMM matrices after the first differing base in the haplotype. Many thanks to Mauricio for all the initial groundwork for these optimizations. The change to the one HC integration test is in the fourth decimal of HaplotypeScore. --- .../gatk/walkers/genotyper/ErrorModel.java | 2 +- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 12 +- .../LikelihoodCalculationEngine.java | 53 ++-- .../sting/utils/pairhmm/CachingPairHMM.java | 181 ++++++++++++ .../utils/pairhmm/LoglessCachingPairHMM.java | 187 +++++++++++++ .../HaplotypeCallerIntegrationTest.java | 2 +- .../sting/utils/pairhmm}/PairHMMUnitTest.java | 242 ++++++---------- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../genotyper/UnifiedArgumentCollection.java | 13 +- .../indels/PairHMMIndelErrorModel.java | 42 +-- .../broadinstitute/sting/utils/Haplotype.java | 16 ++ .../broadinstitute/sting/utils/PairHMM.java | 259 ------------------ .../sting/utils/pairhmm/ExactPairHMM.java | 107 ++++++++ .../sting/utils/pairhmm/OriginalPairHMM.java | 105 +++++++ .../sting/utils/pairhmm/PairHMM.java | 45 +++ 16 files changed, 813 insertions(+), 457 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java create mode 100644 protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java rename {public/java/test/org/broadinstitute/sting/utils => protected/java/test/org/broadinstitute/sting/utils/pairhmm}/PairHMMUnitTest.java (56%) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/PairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index f76225134..8042c15d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -72,7 +72,7 @@ public class ErrorModel { haplotypeMap = new LinkedHashMap(); if (refSampleVC.isIndel()) { pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index fc0c526bc..f09a1ea3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); haplotypeMap = new LinkedHashMap(); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 71e4f5f8a..5f2b5775c 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -114,6 +115,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) protected PrintStream graphWriter = null; + /** + * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + @Hidden @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; @@ -287,7 +294,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); - likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false ); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); } @@ -400,6 +407,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! + // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM + Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); + // evaluate each sample's reads against all haplotypes final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() ); final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 14c1cd59d..62554c4ab 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -30,6 +30,9 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -44,8 +47,25 @@ public class LikelihoodCalculationEngine { private final boolean DEBUG; private final PairHMM pairHMM; - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) { - pairHMM = new PairHMM( noBanded ); + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) { + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + pairHMM = new CachingPairHMM(); + break; + case LOGLESS_CACHING: + pairHMM = new LoglessCachingPairHMM(); + break; + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING."); + } + this.constantGCP = constantGCP; DEBUG = debug; } @@ -69,23 +89,18 @@ public class LikelihoodCalculationEngine { X_METRIC_LENGTH += 2; Y_METRIC_LENGTH += 2; - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); // for each sample's reads for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray ); + computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() ); } } - private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) { final int numHaplotypes = haplotypes.size(); final int numReads = reads.size(); @@ -113,9 +128,8 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(), - readQuals, readInsQuals, readDelQuals, overallGCP, - haplotypeStart, matchMetricArray, XMetricArray, YMetricArray); + readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), + readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0); readCounts[jjj][iii] = readCount; } } @@ -130,7 +144,7 @@ public class LikelihoodCalculationEngine { return iii; } } - return b1.length; + return Math.min(b1.length, b2.length); } @Requires({"haplotypes.size() > 0"}) @@ -280,7 +294,7 @@ public class LikelihoodCalculationEngine { final int numHaplotypes = haplotypes.size(); final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); - bestHaplotypesIndexList.add(0); // always start with the reference haplotype + bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype // set up the default 1-to-1 haplotype mapping object final ArrayList> haplotypeMapping = new ArrayList>(); for( final Haplotype h : haplotypes ) { @@ -322,6 +336,13 @@ public class LikelihoodCalculationEngine { return bestHaplotypes; } + public static int findReferenceIndex( final List haplotypes ) { + for( final Haplotype h : haplotypes ) { + if( h.isReference() ) { return haplotypes.indexOf(h); } + } + throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); + } + public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java new file mode 100644 index 000000000..282db45d5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class CachingPairHMM extends OriginalPairHMM { + + double[][] constantMatrix = null; // The cache in the CachingPairHMM + double[][] distanceMatrix = null; // The cache in the CachingPairHMM + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProbLog10(DEFAULT_GCP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GOP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GCP), + 0.0, + 0.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + } + constantMatrix[l+1][4] = 0.0; + constantMatrix[l+1][5] = 0.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior + + MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0], + XMetricArray[indI - 1][indJ - 1] + constants[1], + YMetricArray[indI - 1][indJ - 1] + constants[1] ); + XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2], + XMetricArray[indI - 1][indJ] + constants[3]); + YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4], + YMetricArray[indI][indJ - 1] + constants[5]); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java new file mode 100644 index 000000000..d2aef5bb5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class LoglessCachingPairHMM extends CachingPairHMM { + + protected static final double SCALE_FACTOR_LOG10 = 300.0; + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProb(DEFAULT_GCP), + QualityUtils.qualToErrorProb(DEFAULT_GOP), + QualityUtils.qualToErrorProb(DEFAULT_GCP), + 1.0, + 1.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], 0.0); + Arrays.fill(XMetricArray[iii], 0.0); + Arrays.fill(YMetricArray[iii], 0.0); + } + + // the initial condition + matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // Math.log10(1.0); + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); + } + constantMatrix[l+1][4] = 1.0; + constantMatrix[l+1][5] = 1.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] + + XMetricArray[indI - 1][indJ - 1] * constants[1] + + YMetricArray[indI - 1][indJ - 1] * constants[1] ); + XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3]; + YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5]; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a8ea4b7da..a441e6c77 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -70,7 +70,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c54c0c9411054bf629bfd98b616e53fc")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java similarity index 56% rename from public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 22bcb1bbf..6281054b1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -23,24 +23,26 @@ */ // our package -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.pairhmm; // the imports for unit testing. - import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; - public class PairHMMUnitTest extends BaseTest { final static boolean EXTENSIVE_TESTING = true; - PairHMM hmm = new PairHMM( false ); // reference implementation - PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding + PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation + PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation + PairHMM cachingHMM = new CachingPairHMM(); + PairHMM loglessHMM = new LoglessCachingPairHMM(); // -------------------------------------------------------------------------------- // @@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest { final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; - public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) { this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); } @@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest { } public double expectedLogL() { - return expectedQual / -10.0; + return (expectedQual / -10.0) + 0.03 ; } - public double tolerance() { - return 0.1; // TODO FIXME arbitrary + public double toleranceFromTheoretical() { + return 0.2; } - public double calcLogL() { + public double toleranceFromReference() { + return 1E-4; + } - double logL = hmm.computeReadLikelihoodGivenHaplotype( + public double toleranceFromExact() { + return 1E-9; + } + + public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) { + pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length); + return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( refBasesWithContext, readBasesWithContext, - qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), - qualAsBytes(gcp, false)); - - return logL; + qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), + qualAsBytes(gcp, false, anchorIndel), 0, true); } private final byte[] asBytes(final String bases, final boolean left, final boolean right) { return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); } - private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { + private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) { final byte phredQuals[] = new byte[readBasesWithContext.length]; - // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM - Arrays.fill(phredQuals, (byte)100); - // update just the bases corresponding to the provided micro read with the quality scores - if( doGOP ) { - phredQuals[0 + CONTEXT.length()] = (byte)phredQual; - } else { - for ( int i = 0; i < read.length(); i++) - phredQuals[i + CONTEXT.length()] = (byte)phredQual; - } + if( anchorIndel ) { + // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM + Arrays.fill(phredQuals, (byte)100); - return phredQuals; - } - } - - final Random random = new Random(87865573); - private class BandedLikelihoodTestProvider extends TestDataProvider { - final String ref, read; - final byte[] refBasesWithContext, readBasesWithContext; - final int baseQual, insQual, delQual, gcp; - final int expectedQual; - final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC"; - final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA"; - final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT"; - final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA"; - final byte[] baseQuals, insQuals, delQuals, gcps; - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { - this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); - } - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { - super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); - this.baseQual = baseQual; - this.delQual = delQual; - this.insQual = insQual; - this.gcp = gcp; - this.read = read; - this.ref = ref; - this.expectedQual = expectedQual; - - refBasesWithContext = asBytes(ref, left, right); - readBasesWithContext = asBytes(read, false, false); - baseQuals = qualAsBytes(baseQual); - insQuals = qualAsBytes(insQual); - delQuals = qualAsBytes(delQual); - gcps = qualAsBytes(gcp, false); - } - - public double expectedLogL() { - double logL = hmm.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - public double tolerance() { - return 0.2; // TODO FIXME arbitrary - } - - public double calcLogL() { - - double logL = bandedHMM.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - private final byte[] asBytes(final String bases, final boolean left, final boolean right) { - return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); - } - - private byte[] qualAsBytes(final int phredQual) { - return qualAsBytes(phredQual, true); - } - - private byte[] qualAsBytes(final int phredQual, final boolean addRandom) { - final byte phredQuals[] = new byte[readBasesWithContext.length]; - Arrays.fill(phredQuals, (byte)phredQual); - if(addRandom) { - for( int iii = 0; iii < phredQuals.length; iii++) { - phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3)); + // update just the bases corresponding to the provided micro read with the quality scores + if( doGOP ) { + phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + } else { + for ( int i = 0; i < read.length(); i++) + phredQuals[i + CONTEXT.length()] = (byte)phredQual; } + } else { + Arrays.fill(phredQuals, (byte)phredQual); } + return phredQuals; } } @@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest { // test all combinations final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { @@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest { for ( boolean insertionP : Arrays.asList(true, false)) { final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); + final String big = Utils.dupString((char) base, size); final String ref = insertionP ? small : big; final String read = insertionP ? big : small; @@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest { return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) - public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); - double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); - } - - @DataProvider(name = "BandedLikelihoodTestProvider") - public Object[][] makeBandedLikelihoodTests() { + final Random random = new Random(87860573); + @DataProvider(name = "OptimizedLikelihoodTestProvider") + public Object[][] makeOptimizedLikelihoodTests() { // context on either side is ACGTTGCA REF ACGTTGCA // test all combinations - final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); - final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { for ( final int gcp : gcps ) { - - // test substitutions - for ( final byte refBase : BaseUtils.BASES ) { - for ( final byte readBase : BaseUtils.BASES ) { - final String ref = new String(new byte[]{refBase}); - final String read = new String(new byte[]{readBase}); - final int expected = refBase == readBase ? 0 : baseQual; - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - } - } - - // test insertions and deletions - for ( final int size : sizes ) { - for ( final byte base : BaseUtils.BASES ) { - final int expected = indelQual + (size - 2) * gcp; - - for ( boolean insertionP : Arrays.asList(true, false)) { - final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); - - final String ref = insertionP ? small : big; - final String read = insertionP ? big : small; - - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + for ( final int refSize : sizes ) { + for ( final int readSize : sizes ) { + String ref = ""; + String read = ""; + for( int iii = 0; iii < refSize; iii++) { + ref += (char) BaseUtils.BASES[random.nextInt(4)]; } + for( int iii = 0; iii < readSize; iii++) { + read += (char) BaseUtils.BASES[random.nextInt(4)]; + } + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true); } } } } } - return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); + return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) - public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, true ); + double calculatedLogL = cfg.calcLogL( originalHMM, true ); + double optimizedLogL = cfg.calcLogL( cachingHMM, true ); + double loglessLogL = cfg.calcLogL( loglessHMM, true ); double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); + } + + @Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true) + public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, false ); + double calculatedLogL = cfg.calcLogL( originalHMM, false ); + double optimizedLogL = cfg.calcLogL( cachingHMM, false ); + double loglessLogL = cfg.calcLogL( loglessHMM, false ); + //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); } @Test @@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); @@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index ebfbc49fe..e0ffb2ba6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -57,7 +57,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 885463fcb..3eda2017c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -65,6 +66,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** + * The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL; + /** * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value. @@ -112,10 +119,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) public int INDEL_HAPLOTYPE_SIZE = 80; - @Hidden - @Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false) - public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false; - @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; @@ -221,10 +224,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; uac.exactCallsLog = exactCallsLog; + uac.pairHMM = pairHMM; // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 9234a9fe8..3d287057c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -30,8 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM; +import org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -48,7 +51,6 @@ public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; private boolean DEBUG = false; - private boolean bandedLikelihoods = false; private static final int MAX_CACHED_QUAL = 127; @@ -67,6 +69,8 @@ public class PairHMMIndelErrorModel { private final byte[] GAP_OPEN_PROB_TABLE; private final byte[] GAP_CONT_PROB_TABLE; + private final PairHMM pairHMM; + ///////////////////////////// // Private Member Variables ///////////////////////////// @@ -85,15 +89,26 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { this.DEBUG = deb; - this.bandedLikelihoods = bandedLikelihoods; + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + case LOGLESS_CACHING: + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL and EXACT."); + } // fill gap penalty table, affine naive model: this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - for (int i = 0; i < START_HRUN_GAP_IDX; i++) { GAP_OPEN_PROB_TABLE[i] = indelGOP; GAP_CONT_PROB_TABLE[i] = indelGCP; @@ -190,7 +205,6 @@ public class PairHMMIndelErrorModel { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - final PairHMM pairHMM = new PairHMM(bandedLikelihoods); int readIdx=0; for (PileupElement p: pileup) { @@ -303,8 +317,6 @@ public class PairHMMIndelErrorModel { final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases); int j=0; - // initialize path metric and traceback memories for likelihood computation - double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; @@ -341,14 +353,9 @@ public class PairHMMIndelErrorModel { final int X_METRIC_LENGTH = readBases.length+2; final int Y_METRIC_LENGTH = haplotypeBases.length+2; - if (matchMetricArray == null) { + if (previousHaplotypeSeen == null) { //no need to reallocate arrays for each new haplotype, as length won't change - matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); } int startIndexInHaplotype = 0; @@ -356,11 +363,10 @@ public class PairHMMIndelErrorModel { startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); previousHaplotypeSeen = haplotypeBases.clone(); - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), - contextLogGapContinuationProbabilities, - startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); + contextLogGapContinuationProbabilities, startIndexInHaplotype, false); if (DEBUG) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index befd24307..b30d47074 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.Serializable; import java.util.*; public class Haplotype { @@ -184,6 +185,21 @@ public class Haplotype { return new Haplotype(newHaplotypeBases); } + public static class HaplotypeBaseComparator implements Comparator, Serializable { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + final byte[] arr1 = hap1.getBases(); + final byte[] arr2 = hap2.getBases(); + // compares byte arrays using lexical ordering + final int len = Math.min(arr1.length, arr2.length); + for( int iii = 0; iii < len; iii++ ) { + final int cmp = arr1[iii] - arr2[iii]; + if (cmp != 0) { return cmp; } + } + return arr2.length - arr1.length; + } + } + public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java deleted file mode 100644 index 15f7a7869..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.*; - -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. - * User: rpoplin - * Date: 3/1/12 - */ - -public class PairHMM { - private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; - private static final byte DEFAULT_GOP = (byte) 45; - private static final byte DEFAULT_GCP = (byte) 10; - private static final double BANDING_TOLERANCE = 22.0; - private static final int BANDING_CLUSTER_WINDOW = 12; - private final boolean noBanded; - - public PairHMM() { - noBanded = false; - } - - public PairHMM( final boolean noBanded ) { - this.noBanded = noBanded; - } - - - public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, - final int X_METRIC_LENGTH) { - - for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { - Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); - } - - // the initial condition - matchMetricArray[1][1] = 0.0; // Math.log10(1.0); - - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); - - return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // ensure that all the qual scores have valid values - for( int iii = 0; iii < readQuals.length; iii++ ) { - readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); - } - - if( false ) { - final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step - final ArrayList workToBeAdded = new ArrayList(); - final ArrayList calculatedValues = new ArrayList(); - final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; - workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype - - for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over - //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order - int el = 1; - for( int work : workQueue ) { - // choose the appropriate diagonal baseline location - int iii = 0; - int jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work; - jjj -= work; - while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { - iii--; - jjj++; - work--; - } - if( !detectClusteredStartLocations(workToBeAdded, work ) ) { - workToBeAdded.add(work); // keep this thread going once it has started - } - - if( work >= el - 3 ) { - // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value - double maxElement = Double.NEGATIVE_INFINITY; - for( el = work; el < numDiags + 1; el++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - calculatedValues.add(bestMetric); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix - break; - } - if( --jjj <= 0 ) { // don't walk off the edge of the matrix - break; - } - } - - // find a local maximum to start a new band in the work queue - double localMaxElement = Double.NEGATIVE_INFINITY; - int localMaxElementIndex = 0; - for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { - final double bestMetric = calculatedValues.get(kkk); - if( bestMetric > localMaxElement ) { - localMaxElement = bestMetric; - localMaxElementIndex = kkk; - } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum - if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { - workToBeAdded.add( work + localMaxElementIndex ); - } - break; - } - } - calculatedValues.clear(); - - // reset iii and jjj to the appropriate diagonal baseline location - iii = 0; - jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work-1; - jjj -= work-1; - - // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value - for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - } - } - } - workQueue.clear(); - workQueue.addAll(workToBeAdded); - workToBeAdded.clear(); - } - } else { - // simple rectangular version of update loop, slow - for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { - for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { - if( (iii == 1 && jjj == 1) ) { continue; } - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, - matchMetricArray, XMetricArray, YMetricArray); - } - } - } - - // final probability is the log10 sum of the last element in all three state arrays - final int endI = X_METRIC_LENGTH - 1; - final int endJ = Y_METRIC_LENGTH - 1; - return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); - } - - private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, - final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions - final int im1 = indI - 1; - final int jm1 = indJ - 1; - - // update the match array - double pBaseReadLog10 = 0.0; // Math.log10(1.0); - if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state - final byte x = readBases[im1-1]; - final byte y = haplotypeBases[jm1-1]; - final byte qual = readQuals[im1-1]; - pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); - } - final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); - final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); - final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); - matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); - - // update the X (insertion) array - final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); - final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); - - // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype - final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); - final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); - } - - // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other - private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { - for(int x : list) { - if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { - return true; - } - } - return false; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java new file mode 100644 index 000000000..17089ee81 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java @@ -0,0 +1,107 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public class ExactPairHMM extends PairHMM { + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0}); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1}); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2}); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java new file mode 100644 index 000000000..cd946cdf1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * User: rpoplin + * Date: 3/1/12 + */ + +public class OriginalPairHMM extends ExactPairHMM { + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java new file mode 100644 index 000000000..7a1399c32 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public abstract class PairHMM { + protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; + protected static final byte DEFAULT_GOP = (byte) 45; + protected static final byte DEFAULT_GCP = (byte) 10; + + public enum HMM_IMPLEMENTATION { + /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ + EXACT, + /* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */ + ORIGINAL, + /* Optimized version of the PairHMM which caches per-read computations */ + CACHING, + /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ + LOGLESS_CACHING + } + + protected double[][] matchMetricArray = null; + protected double[][] XMetricArray = null; + protected double[][] YMetricArray = null; + + public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ); + + @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", + "readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood + public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ); +} From 2c624f76c83f8cfec61001f89928ebe39ab8d713 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 20 Oct 2012 20:35:54 -0400 Subject: [PATCH 3/6] Refactoring the Unified (and Standard) Argument Collections because it was really ugly that the subclass had to do all the cloning for the super class. The clone() method is really not recommended best practice in Java anyways, so I changed it so that we use standard overloaded constructors. Confirmed that the Haplotype Caller --help docs do not include UG-specific arguments. --- .../haplotypecaller/HaplotypeCaller.java | 4 +- .../StandardCallerArgumentCollection.java | 26 ++++++ .../genotyper/UnifiedArgumentCollection.java | 90 +++++++------------ 3 files changed, 62 insertions(+), 58 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 5f2b5775c..6d6351fc5 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -241,14 +241,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested - UnifiedArgumentCollection simpleUAC = UAC.clone(); + UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 085a60191..9b9f04228 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -69,7 +69,33 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false) public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2; + /** + * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. + * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we + * will try to remove (N * contamination fraction) bases for each alternate allele. + */ + @Hidden + @Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) + public double CONTAMINATION_PERCENTAGE = 0.0; + @Hidden @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; + + + public StandardCallerArgumentCollection() { } + + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) { + this.alleles = SCAC.alleles; + this.GenotypingMode = SCAC.GenotypingMode; + this.heterozygosity = SCAC.heterozygosity; + this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; + this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; + this.OutputMode = SCAC.OutputMode; + this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; + this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE; + this.exactCallsLog = SCAC.exactCallsLog; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 3eda2017c..17137c5e9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -186,63 +186,41 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! - public UnifiedArgumentCollection clone() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - - uac.GLmodel = GLmodel; - uac.AFmodel = AFmodel; - uac.heterozygosity = heterozygosity; - uac.PCR_error = PCR_error; - uac.GenotypingMode = GenotypingMode; - uac.OutputMode = OutputMode; - uac.NO_SLOD = NO_SLOD; - uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; - uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; - uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; - uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; - uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; - uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; - uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; - uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; - uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; - uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; - uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; - uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; - uac.alleles = alleles; - uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; - uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS; - uac.GLmodel = GLmodel; - uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL; - uac.referenceSampleRod = referenceSampleRod; - uac.referenceSampleName = referenceSampleName; - uac.samplePloidy = samplePloidy; - uac.maxQualityScore = minQualityScore; - uac.phredScaledPrior = phredScaledPrior; - uac.minPower = minPower; - uac.minReferenceDepth = minReferenceDepth; - uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; - uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; - uac.exactCallsLog = exactCallsLog; - uac.pairHMM = pairHMM; - - // todo- arguments to remove - uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - return uac; - } - public UnifiedArgumentCollection() { } - public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) { - super(); - this.alleles = SCAC.alleles; - this.GenotypingMode = SCAC.GenotypingMode; - this.heterozygosity = SCAC.heterozygosity; - this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; - this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; - this.OutputMode = SCAC.OutputMode; - this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; - this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; - this.exactCallsLog = SCAC.exactCallsLog; + public UnifiedArgumentCollection(final StandardCallerArgumentCollection SCAC) { + super(SCAC); + } + + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { + this.GLmodel = uac.GLmodel; + this.AFmodel = uac.AFmodel; + this.PCR_error = uac.PCR_error; + this.NO_SLOD = uac.NO_SLOD; + this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; + this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE; + this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION; + this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING; + this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE; + this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY; + this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY; + this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY; + this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO; + this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE; + this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL; + this.referenceSampleRod = uac.referenceSampleRod; + this.referenceSampleName = uac.referenceSampleName; + this.samplePloidy = uac.samplePloidy; + this.maxQualityScore = uac.minQualityScore; + this.phredScaledPrior = uac.phredScaledPrior; + this.minPower = uac.minPower; + this.minReferenceDepth = uac.minReferenceDepth; + this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; + this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; + this.pairHMM = uac.pairHMM; + + // todo- arguments to remove + this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } } From 841a906f210ac363817fe3e84794281501e4b30f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 20 Oct 2012 23:31:56 -0400 Subject: [PATCH 4/6] Adding a hidden (for now) argument to UG (and HC) that tells the caller that the incoming samples are contaminated by N% and to fix it by aggressively down-sampling all alleles. This actually works. Yes, you read that right: given that we know what N is, we can make good calls on bams that have N% contamination. Only hooked up for SNPS right now. No tests added yet. --- ...NPGenotypeLikelihoodsCalculationModel.java | 67 +++++++++++++++++-- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 76ba72017..d053e13a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -41,19 +41,20 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final boolean useAlleleFromVCF; private final double[] likelihoodSums = new double[4]; - + private final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i] = new ArrayList(); } public VariantContext getLikelihoods(final RefMetaDataTracker tracker, @@ -78,8 +79,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ArrayList GLs = new ArrayList(contexts.size()); for ( Map.Entry sample : contexts.entrySet() ) { ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 ) + pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE); if ( useBAQedPileup ) - pileup = createBAQedPileup( pileup ); + pileup = createBAQedPileup(pileup); // create the GenotypeLikelihoods object final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error); @@ -150,8 +153,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // create the genotypes; no-call everyone for now final GenotypesContext genotypes = GenotypesContext.create(); - final List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); for ( SampleGenotypeData sampleData : GLs ) { final double[] allLikelihoods = sampleData.GL.getLikelihoods(); @@ -202,6 +203,42 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return allelesToUse; } + public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) { + // special case removal of all reads + if ( contaminationPercentage >= 1.0 ) + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); + + // start by stratifying the reads by the alleles they represent at this position + for( final PileupElement pe : pileup ) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } + + // Down-sample *each* allele by the contamination fraction applied to the entire pileup. + // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. + int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage); + final TreeSet elementsToKeep = new TreeSet(new Comparator() { + @Override + public int compare(PileupElement element1, PileupElement element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + for ( int i = 0; i < 4; i++ ) { + final ArrayList alleleList = alleleStratifiedElements[i]; + if ( alleleList.size() > numReadsToRemove ) + elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove)); + } + + // clean up pointers so memory can be garbage collected if needed + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i].clear(); + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); + } + public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { final List BAQedElements = new ArrayList(); for( final PileupElement PE : pileup ) { @@ -220,6 +257,22 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } } + private List downsampleElements(final ArrayList elements, final int numElementsToRemove) { + final int pileupSize = elements.size(); + final BitSet itemsToRemove = new BitSet(pileupSize); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + ArrayList elementsToKeep = new ArrayList(pileupSize - numElementsToRemove); + for ( int i = 0; i < pileupSize; i++ ) { + if ( !itemsToRemove.get(i) ) + elementsToKeep.add(elements.get(i)); + } + + return elementsToKeep; + } + private static class SampleGenotypeData { public final String name; From d44d5b827538789bae68c47c1634080e6c7d04c9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 21 Oct 2012 01:29:59 -0400 Subject: [PATCH 5/6] Fix RawHapMapCodec so that it can build indexes. Minor fixes to VCF codec. --- .../sting/utils/codecs/hapmap/RawHapMapCodec.java | 9 +++++++++ .../broadinstitute/sting/utils/codecs/vcf/VCFCodec.java | 4 +--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index 916fb43ea..6b3fce966 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -25,8 +25,11 @@ package org.broadinstitute.sting.utils.codecs.hapmap; import org.broad.tribble.AsciiFeatureCodec; +import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.annotation.Strand; +import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; +import org.broad.tribble.readers.PositionalBufferedStream; import java.io.IOException; import java.util.Arrays; @@ -116,4 +119,10 @@ public class RawHapMapCodec extends AsciiFeatureCodec { } return headerLine; } + + @Override + public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException { + final AsciiLineReader br = new AsciiLineReader(stream); + return new FeatureCodecHeader(readHeader(br), br.getPosition()); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 4df1efee7..f12f13dc7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.*; import java.io.IOException; import java.util.*; @@ -119,7 +117,7 @@ public class VCFCodec extends AbstractVCFCodec { // empty set for passes filters List fFields = new LinkedList(); // otherwise we have to parse and cache the value - if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) fFields.add(filterString); else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); From 0616b98551e7b3dbfa571ff93c65c9a1ef645029 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 21 Oct 2012 08:26:26 -0400 Subject: [PATCH 6/6] Not sure why we were setting the UAC variables instead of the simpleUAC ones when that's what we wanted. --- .../gatk/walkers/haplotypecaller/HaplotypeCaller.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 6d6351fc5..a08614deb 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -242,13 +242,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); - UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);