From a647f1e076a37d2fbfbbc7650a2111bcff74a367 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sat, 20 Oct 2012 16:38:18 -0400 Subject: [PATCH 02/34] Refactoring the PairHMM util class to allow for multiple implementations which can be specified by the callers via an enum argument. Adding an optimized PairHMM implementation which caches per-read calculations as well as a logless implementation which drastically reduces the runtime of the HMM while also increasing the precision of the result. In the HaplotypeCaller we now lexicographically sort the haplotypes to take maximal benefit of the haplotype offset optimization which only recalculates the HMM matrices after the first differing base in the haplotype. Many thanks to Mauricio for all the initial groundwork for these optimizations. The change to the one HC integration test is in the fourth decimal of HaplotypeScore. --- .../gatk/walkers/genotyper/ErrorModel.java | 2 +- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 12 +- .../LikelihoodCalculationEngine.java | 53 ++-- .../sting/utils/pairhmm/CachingPairHMM.java | 181 ++++++++++++ .../utils/pairhmm/LoglessCachingPairHMM.java | 187 +++++++++++++ .../HaplotypeCallerIntegrationTest.java | 2 +- .../sting/utils/pairhmm}/PairHMMUnitTest.java | 242 ++++++---------- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../genotyper/UnifiedArgumentCollection.java | 13 +- .../indels/PairHMMIndelErrorModel.java | 42 +-- .../broadinstitute/sting/utils/Haplotype.java | 16 ++ .../broadinstitute/sting/utils/PairHMM.java | 259 ------------------ .../sting/utils/pairhmm/ExactPairHMM.java | 107 ++++++++ .../sting/utils/pairhmm/OriginalPairHMM.java | 105 +++++++ .../sting/utils/pairhmm/PairHMM.java | 45 +++ 16 files changed, 813 insertions(+), 457 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java create mode 100644 protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java rename {public/java/test/org/broadinstitute/sting/utils => protected/java/test/org/broadinstitute/sting/utils/pairhmm}/PairHMMUnitTest.java (56%) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/PairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index f76225134..8042c15d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -72,7 +72,7 @@ public class ErrorModel { haplotypeMap = new LinkedHashMap(); if (refSampleVC.isIndel()) { pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index fc0c526bc..f09a1ea3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); haplotypeMap = new LinkedHashMap(); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 71e4f5f8a..5f2b5775c 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -114,6 +115,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) protected PrintStream graphWriter = null; + /** + * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + @Hidden @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; @@ -287,7 +294,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); - likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false ); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); } @@ -400,6 +407,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! + // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM + Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); + // evaluate each sample's reads against all haplotypes final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() ); final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 14c1cd59d..62554c4ab 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -30,6 +30,9 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -44,8 +47,25 @@ public class LikelihoodCalculationEngine { private final boolean DEBUG; private final PairHMM pairHMM; - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) { - pairHMM = new PairHMM( noBanded ); + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) { + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + pairHMM = new CachingPairHMM(); + break; + case LOGLESS_CACHING: + pairHMM = new LoglessCachingPairHMM(); + break; + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING."); + } + this.constantGCP = constantGCP; DEBUG = debug; } @@ -69,23 +89,18 @@ public class LikelihoodCalculationEngine { X_METRIC_LENGTH += 2; Y_METRIC_LENGTH += 2; - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); // for each sample's reads for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray ); + computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() ); } } - private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) { final int numHaplotypes = haplotypes.size(); final int numReads = reads.size(); @@ -113,9 +128,8 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(), - readQuals, readInsQuals, readDelQuals, overallGCP, - haplotypeStart, matchMetricArray, XMetricArray, YMetricArray); + readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), + readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0); readCounts[jjj][iii] = readCount; } } @@ -130,7 +144,7 @@ public class LikelihoodCalculationEngine { return iii; } } - return b1.length; + return Math.min(b1.length, b2.length); } @Requires({"haplotypes.size() > 0"}) @@ -280,7 +294,7 @@ public class LikelihoodCalculationEngine { final int numHaplotypes = haplotypes.size(); final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); - bestHaplotypesIndexList.add(0); // always start with the reference haplotype + bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype // set up the default 1-to-1 haplotype mapping object final ArrayList> haplotypeMapping = new ArrayList>(); for( final Haplotype h : haplotypes ) { @@ -322,6 +336,13 @@ public class LikelihoodCalculationEngine { return bestHaplotypes; } + public static int findReferenceIndex( final List haplotypes ) { + for( final Haplotype h : haplotypes ) { + if( h.isReference() ) { return haplotypes.indexOf(h); } + } + throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); + } + public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java new file mode 100644 index 000000000..282db45d5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class CachingPairHMM extends OriginalPairHMM { + + double[][] constantMatrix = null; // The cache in the CachingPairHMM + double[][] distanceMatrix = null; // The cache in the CachingPairHMM + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProbLog10(DEFAULT_GCP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GOP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GCP), + 0.0, + 0.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + } + constantMatrix[l+1][4] = 0.0; + constantMatrix[l+1][5] = 0.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior + + MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0], + XMetricArray[indI - 1][indJ - 1] + constants[1], + YMetricArray[indI - 1][indJ - 1] + constants[1] ); + XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2], + XMetricArray[indI - 1][indJ] + constants[3]); + YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4], + YMetricArray[indI][indJ - 1] + constants[5]); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java new file mode 100644 index 000000000..d2aef5bb5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class LoglessCachingPairHMM extends CachingPairHMM { + + protected static final double SCALE_FACTOR_LOG10 = 300.0; + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProb(DEFAULT_GCP), + QualityUtils.qualToErrorProb(DEFAULT_GOP), + QualityUtils.qualToErrorProb(DEFAULT_GCP), + 1.0, + 1.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], 0.0); + Arrays.fill(XMetricArray[iii], 0.0); + Arrays.fill(YMetricArray[iii], 0.0); + } + + // the initial condition + matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // Math.log10(1.0); + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); + } + constantMatrix[l+1][4] = 1.0; + constantMatrix[l+1][5] = 1.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] + + XMetricArray[indI - 1][indJ - 1] * constants[1] + + YMetricArray[indI - 1][indJ - 1] * constants[1] ); + XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3]; + YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5]; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a8ea4b7da..a441e6c77 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -70,7 +70,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c54c0c9411054bf629bfd98b616e53fc")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java similarity index 56% rename from public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 22bcb1bbf..6281054b1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -23,24 +23,26 @@ */ // our package -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.pairhmm; // the imports for unit testing. - import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; - public class PairHMMUnitTest extends BaseTest { final static boolean EXTENSIVE_TESTING = true; - PairHMM hmm = new PairHMM( false ); // reference implementation - PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding + PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation + PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation + PairHMM cachingHMM = new CachingPairHMM(); + PairHMM loglessHMM = new LoglessCachingPairHMM(); // -------------------------------------------------------------------------------- // @@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest { final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; - public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) { this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); } @@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest { } public double expectedLogL() { - return expectedQual / -10.0; + return (expectedQual / -10.0) + 0.03 ; } - public double tolerance() { - return 0.1; // TODO FIXME arbitrary + public double toleranceFromTheoretical() { + return 0.2; } - public double calcLogL() { + public double toleranceFromReference() { + return 1E-4; + } - double logL = hmm.computeReadLikelihoodGivenHaplotype( + public double toleranceFromExact() { + return 1E-9; + } + + public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) { + pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length); + return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( refBasesWithContext, readBasesWithContext, - qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), - qualAsBytes(gcp, false)); - - return logL; + qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), + qualAsBytes(gcp, false, anchorIndel), 0, true); } private final byte[] asBytes(final String bases, final boolean left, final boolean right) { return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); } - private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { + private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) { final byte phredQuals[] = new byte[readBasesWithContext.length]; - // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM - Arrays.fill(phredQuals, (byte)100); - // update just the bases corresponding to the provided micro read with the quality scores - if( doGOP ) { - phredQuals[0 + CONTEXT.length()] = (byte)phredQual; - } else { - for ( int i = 0; i < read.length(); i++) - phredQuals[i + CONTEXT.length()] = (byte)phredQual; - } + if( anchorIndel ) { + // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM + Arrays.fill(phredQuals, (byte)100); - return phredQuals; - } - } - - final Random random = new Random(87865573); - private class BandedLikelihoodTestProvider extends TestDataProvider { - final String ref, read; - final byte[] refBasesWithContext, readBasesWithContext; - final int baseQual, insQual, delQual, gcp; - final int expectedQual; - final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC"; - final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA"; - final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT"; - final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA"; - final byte[] baseQuals, insQuals, delQuals, gcps; - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { - this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); - } - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { - super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); - this.baseQual = baseQual; - this.delQual = delQual; - this.insQual = insQual; - this.gcp = gcp; - this.read = read; - this.ref = ref; - this.expectedQual = expectedQual; - - refBasesWithContext = asBytes(ref, left, right); - readBasesWithContext = asBytes(read, false, false); - baseQuals = qualAsBytes(baseQual); - insQuals = qualAsBytes(insQual); - delQuals = qualAsBytes(delQual); - gcps = qualAsBytes(gcp, false); - } - - public double expectedLogL() { - double logL = hmm.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - public double tolerance() { - return 0.2; // TODO FIXME arbitrary - } - - public double calcLogL() { - - double logL = bandedHMM.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - private final byte[] asBytes(final String bases, final boolean left, final boolean right) { - return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); - } - - private byte[] qualAsBytes(final int phredQual) { - return qualAsBytes(phredQual, true); - } - - private byte[] qualAsBytes(final int phredQual, final boolean addRandom) { - final byte phredQuals[] = new byte[readBasesWithContext.length]; - Arrays.fill(phredQuals, (byte)phredQual); - if(addRandom) { - for( int iii = 0; iii < phredQuals.length; iii++) { - phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3)); + // update just the bases corresponding to the provided micro read with the quality scores + if( doGOP ) { + phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + } else { + for ( int i = 0; i < read.length(); i++) + phredQuals[i + CONTEXT.length()] = (byte)phredQual; } + } else { + Arrays.fill(phredQuals, (byte)phredQual); } + return phredQuals; } } @@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest { // test all combinations final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { @@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest { for ( boolean insertionP : Arrays.asList(true, false)) { final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); + final String big = Utils.dupString((char) base, size); final String ref = insertionP ? small : big; final String read = insertionP ? big : small; @@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest { return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) - public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); - double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); - } - - @DataProvider(name = "BandedLikelihoodTestProvider") - public Object[][] makeBandedLikelihoodTests() { + final Random random = new Random(87860573); + @DataProvider(name = "OptimizedLikelihoodTestProvider") + public Object[][] makeOptimizedLikelihoodTests() { // context on either side is ACGTTGCA REF ACGTTGCA // test all combinations - final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); - final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { for ( final int gcp : gcps ) { - - // test substitutions - for ( final byte refBase : BaseUtils.BASES ) { - for ( final byte readBase : BaseUtils.BASES ) { - final String ref = new String(new byte[]{refBase}); - final String read = new String(new byte[]{readBase}); - final int expected = refBase == readBase ? 0 : baseQual; - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - } - } - - // test insertions and deletions - for ( final int size : sizes ) { - for ( final byte base : BaseUtils.BASES ) { - final int expected = indelQual + (size - 2) * gcp; - - for ( boolean insertionP : Arrays.asList(true, false)) { - final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); - - final String ref = insertionP ? small : big; - final String read = insertionP ? big : small; - - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + for ( final int refSize : sizes ) { + for ( final int readSize : sizes ) { + String ref = ""; + String read = ""; + for( int iii = 0; iii < refSize; iii++) { + ref += (char) BaseUtils.BASES[random.nextInt(4)]; } + for( int iii = 0; iii < readSize; iii++) { + read += (char) BaseUtils.BASES[random.nextInt(4)]; + } + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true); } } } } } - return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); + return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) - public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, true ); + double calculatedLogL = cfg.calcLogL( originalHMM, true ); + double optimizedLogL = cfg.calcLogL( cachingHMM, true ); + double loglessLogL = cfg.calcLogL( loglessHMM, true ); double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); + } + + @Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true) + public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, false ); + double calculatedLogL = cfg.calcLogL( originalHMM, false ); + double optimizedLogL = cfg.calcLogL( cachingHMM, false ); + double loglessLogL = cfg.calcLogL( loglessHMM, false ); + //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); } @Test @@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); @@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index ebfbc49fe..e0ffb2ba6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -57,7 +57,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 885463fcb..3eda2017c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -65,6 +66,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** + * The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL; + /** * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value. @@ -112,10 +119,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) public int INDEL_HAPLOTYPE_SIZE = 80; - @Hidden - @Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false) - public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false; - @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; @@ -221,10 +224,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; uac.exactCallsLog = exactCallsLog; + uac.pairHMM = pairHMM; // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 9234a9fe8..3d287057c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -30,8 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM; +import org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -48,7 +51,6 @@ public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; private boolean DEBUG = false; - private boolean bandedLikelihoods = false; private static final int MAX_CACHED_QUAL = 127; @@ -67,6 +69,8 @@ public class PairHMMIndelErrorModel { private final byte[] GAP_OPEN_PROB_TABLE; private final byte[] GAP_CONT_PROB_TABLE; + private final PairHMM pairHMM; + ///////////////////////////// // Private Member Variables ///////////////////////////// @@ -85,15 +89,26 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { this.DEBUG = deb; - this.bandedLikelihoods = bandedLikelihoods; + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + case LOGLESS_CACHING: + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL and EXACT."); + } // fill gap penalty table, affine naive model: this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - for (int i = 0; i < START_HRUN_GAP_IDX; i++) { GAP_OPEN_PROB_TABLE[i] = indelGOP; GAP_CONT_PROB_TABLE[i] = indelGCP; @@ -190,7 +205,6 @@ public class PairHMMIndelErrorModel { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - final PairHMM pairHMM = new PairHMM(bandedLikelihoods); int readIdx=0; for (PileupElement p: pileup) { @@ -303,8 +317,6 @@ public class PairHMMIndelErrorModel { final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases); int j=0; - // initialize path metric and traceback memories for likelihood computation - double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; @@ -341,14 +353,9 @@ public class PairHMMIndelErrorModel { final int X_METRIC_LENGTH = readBases.length+2; final int Y_METRIC_LENGTH = haplotypeBases.length+2; - if (matchMetricArray == null) { + if (previousHaplotypeSeen == null) { //no need to reallocate arrays for each new haplotype, as length won't change - matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); } int startIndexInHaplotype = 0; @@ -356,11 +363,10 @@ public class PairHMMIndelErrorModel { startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); previousHaplotypeSeen = haplotypeBases.clone(); - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), - contextLogGapContinuationProbabilities, - startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); + contextLogGapContinuationProbabilities, startIndexInHaplotype, false); if (DEBUG) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index befd24307..b30d47074 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.Serializable; import java.util.*; public class Haplotype { @@ -184,6 +185,21 @@ public class Haplotype { return new Haplotype(newHaplotypeBases); } + public static class HaplotypeBaseComparator implements Comparator, Serializable { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + final byte[] arr1 = hap1.getBases(); + final byte[] arr2 = hap2.getBases(); + // compares byte arrays using lexical ordering + final int len = Math.min(arr1.length, arr2.length); + for( int iii = 0; iii < len; iii++ ) { + final int cmp = arr1[iii] - arr2[iii]; + if (cmp != 0) { return cmp; } + } + return arr2.length - arr1.length; + } + } + public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java deleted file mode 100644 index 15f7a7869..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.*; - -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. - * User: rpoplin - * Date: 3/1/12 - */ - -public class PairHMM { - private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; - private static final byte DEFAULT_GOP = (byte) 45; - private static final byte DEFAULT_GCP = (byte) 10; - private static final double BANDING_TOLERANCE = 22.0; - private static final int BANDING_CLUSTER_WINDOW = 12; - private final boolean noBanded; - - public PairHMM() { - noBanded = false; - } - - public PairHMM( final boolean noBanded ) { - this.noBanded = noBanded; - } - - - public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, - final int X_METRIC_LENGTH) { - - for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { - Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); - } - - // the initial condition - matchMetricArray[1][1] = 0.0; // Math.log10(1.0); - - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); - - return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // ensure that all the qual scores have valid values - for( int iii = 0; iii < readQuals.length; iii++ ) { - readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); - } - - if( false ) { - final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step - final ArrayList workToBeAdded = new ArrayList(); - final ArrayList calculatedValues = new ArrayList(); - final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; - workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype - - for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over - //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order - int el = 1; - for( int work : workQueue ) { - // choose the appropriate diagonal baseline location - int iii = 0; - int jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work; - jjj -= work; - while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { - iii--; - jjj++; - work--; - } - if( !detectClusteredStartLocations(workToBeAdded, work ) ) { - workToBeAdded.add(work); // keep this thread going once it has started - } - - if( work >= el - 3 ) { - // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value - double maxElement = Double.NEGATIVE_INFINITY; - for( el = work; el < numDiags + 1; el++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - calculatedValues.add(bestMetric); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix - break; - } - if( --jjj <= 0 ) { // don't walk off the edge of the matrix - break; - } - } - - // find a local maximum to start a new band in the work queue - double localMaxElement = Double.NEGATIVE_INFINITY; - int localMaxElementIndex = 0; - for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { - final double bestMetric = calculatedValues.get(kkk); - if( bestMetric > localMaxElement ) { - localMaxElement = bestMetric; - localMaxElementIndex = kkk; - } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum - if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { - workToBeAdded.add( work + localMaxElementIndex ); - } - break; - } - } - calculatedValues.clear(); - - // reset iii and jjj to the appropriate diagonal baseline location - iii = 0; - jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work-1; - jjj -= work-1; - - // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value - for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - } - } - } - workQueue.clear(); - workQueue.addAll(workToBeAdded); - workToBeAdded.clear(); - } - } else { - // simple rectangular version of update loop, slow - for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { - for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { - if( (iii == 1 && jjj == 1) ) { continue; } - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, - matchMetricArray, XMetricArray, YMetricArray); - } - } - } - - // final probability is the log10 sum of the last element in all three state arrays - final int endI = X_METRIC_LENGTH - 1; - final int endJ = Y_METRIC_LENGTH - 1; - return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); - } - - private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, - final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions - final int im1 = indI - 1; - final int jm1 = indJ - 1; - - // update the match array - double pBaseReadLog10 = 0.0; // Math.log10(1.0); - if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state - final byte x = readBases[im1-1]; - final byte y = haplotypeBases[jm1-1]; - final byte qual = readQuals[im1-1]; - pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); - } - final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); - final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); - final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); - matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); - - // update the X (insertion) array - final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); - final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); - - // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype - final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); - final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); - } - - // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other - private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { - for(int x : list) { - if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { - return true; - } - } - return false; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java new file mode 100644 index 000000000..17089ee81 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java @@ -0,0 +1,107 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public class ExactPairHMM extends PairHMM { + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0}); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1}); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2}); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java new file mode 100644 index 000000000..cd946cdf1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * User: rpoplin + * Date: 3/1/12 + */ + +public class OriginalPairHMM extends ExactPairHMM { + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java new file mode 100644 index 000000000..7a1399c32 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public abstract class PairHMM { + protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; + protected static final byte DEFAULT_GOP = (byte) 45; + protected static final byte DEFAULT_GCP = (byte) 10; + + public enum HMM_IMPLEMENTATION { + /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ + EXACT, + /* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */ + ORIGINAL, + /* Optimized version of the PairHMM which caches per-read computations */ + CACHING, + /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ + LOGLESS_CACHING + } + + protected double[][] matchMetricArray = null; + protected double[][] XMetricArray = null; + protected double[][] YMetricArray = null; + + public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ); + + @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", + "readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood + public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ); +} From 2c624f76c83f8cfec61001f89928ebe39ab8d713 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 20 Oct 2012 20:35:54 -0400 Subject: [PATCH 03/34] Refactoring the Unified (and Standard) Argument Collections because it was really ugly that the subclass had to do all the cloning for the super class. The clone() method is really not recommended best practice in Java anyways, so I changed it so that we use standard overloaded constructors. Confirmed that the Haplotype Caller --help docs do not include UG-specific arguments. --- .../haplotypecaller/HaplotypeCaller.java | 4 +- .../StandardCallerArgumentCollection.java | 26 ++++++ .../genotyper/UnifiedArgumentCollection.java | 90 +++++++------------ 3 files changed, 62 insertions(+), 58 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 5f2b5775c..6d6351fc5 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -241,14 +241,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested - UnifiedArgumentCollection simpleUAC = UAC.clone(); + UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 085a60191..9b9f04228 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -69,7 +69,33 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false) public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2; + /** + * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. + * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we + * will try to remove (N * contamination fraction) bases for each alternate allele. + */ + @Hidden + @Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) + public double CONTAMINATION_PERCENTAGE = 0.0; + @Hidden @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; + + + public StandardCallerArgumentCollection() { } + + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) { + this.alleles = SCAC.alleles; + this.GenotypingMode = SCAC.GenotypingMode; + this.heterozygosity = SCAC.heterozygosity; + this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; + this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; + this.OutputMode = SCAC.OutputMode; + this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; + this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE; + this.exactCallsLog = SCAC.exactCallsLog; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 3eda2017c..17137c5e9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -186,63 +186,41 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! - public UnifiedArgumentCollection clone() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - - uac.GLmodel = GLmodel; - uac.AFmodel = AFmodel; - uac.heterozygosity = heterozygosity; - uac.PCR_error = PCR_error; - uac.GenotypingMode = GenotypingMode; - uac.OutputMode = OutputMode; - uac.NO_SLOD = NO_SLOD; - uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; - uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; - uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; - uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; - uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; - uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; - uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; - uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; - uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; - uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; - uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; - uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; - uac.alleles = alleles; - uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; - uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS; - uac.GLmodel = GLmodel; - uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL; - uac.referenceSampleRod = referenceSampleRod; - uac.referenceSampleName = referenceSampleName; - uac.samplePloidy = samplePloidy; - uac.maxQualityScore = minQualityScore; - uac.phredScaledPrior = phredScaledPrior; - uac.minPower = minPower; - uac.minReferenceDepth = minReferenceDepth; - uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; - uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; - uac.exactCallsLog = exactCallsLog; - uac.pairHMM = pairHMM; - - // todo- arguments to remove - uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - return uac; - } - public UnifiedArgumentCollection() { } - public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) { - super(); - this.alleles = SCAC.alleles; - this.GenotypingMode = SCAC.GenotypingMode; - this.heterozygosity = SCAC.heterozygosity; - this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; - this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; - this.OutputMode = SCAC.OutputMode; - this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; - this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; - this.exactCallsLog = SCAC.exactCallsLog; + public UnifiedArgumentCollection(final StandardCallerArgumentCollection SCAC) { + super(SCAC); + } + + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { + this.GLmodel = uac.GLmodel; + this.AFmodel = uac.AFmodel; + this.PCR_error = uac.PCR_error; + this.NO_SLOD = uac.NO_SLOD; + this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; + this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE; + this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION; + this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING; + this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE; + this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY; + this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY; + this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY; + this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO; + this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE; + this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL; + this.referenceSampleRod = uac.referenceSampleRod; + this.referenceSampleName = uac.referenceSampleName; + this.samplePloidy = uac.samplePloidy; + this.maxQualityScore = uac.minQualityScore; + this.phredScaledPrior = uac.phredScaledPrior; + this.minPower = uac.minPower; + this.minReferenceDepth = uac.minReferenceDepth; + this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; + this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; + this.pairHMM = uac.pairHMM; + + // todo- arguments to remove + this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } } From 841a906f210ac363817fe3e84794281501e4b30f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 20 Oct 2012 23:31:56 -0400 Subject: [PATCH 04/34] Adding a hidden (for now) argument to UG (and HC) that tells the caller that the incoming samples are contaminated by N% and to fix it by aggressively down-sampling all alleles. This actually works. Yes, you read that right: given that we know what N is, we can make good calls on bams that have N% contamination. Only hooked up for SNPS right now. No tests added yet. --- ...NPGenotypeLikelihoodsCalculationModel.java | 67 +++++++++++++++++-- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 76ba72017..d053e13a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -41,19 +41,20 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final boolean useAlleleFromVCF; private final double[] likelihoodSums = new double[4]; - + private final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i] = new ArrayList(); } public VariantContext getLikelihoods(final RefMetaDataTracker tracker, @@ -78,8 +79,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ArrayList GLs = new ArrayList(contexts.size()); for ( Map.Entry sample : contexts.entrySet() ) { ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 ) + pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE); if ( useBAQedPileup ) - pileup = createBAQedPileup( pileup ); + pileup = createBAQedPileup(pileup); // create the GenotypeLikelihoods object final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error); @@ -150,8 +153,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // create the genotypes; no-call everyone for now final GenotypesContext genotypes = GenotypesContext.create(); - final List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); for ( SampleGenotypeData sampleData : GLs ) { final double[] allLikelihoods = sampleData.GL.getLikelihoods(); @@ -202,6 +203,42 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return allelesToUse; } + public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) { + // special case removal of all reads + if ( contaminationPercentage >= 1.0 ) + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); + + // start by stratifying the reads by the alleles they represent at this position + for( final PileupElement pe : pileup ) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } + + // Down-sample *each* allele by the contamination fraction applied to the entire pileup. + // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. + int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage); + final TreeSet elementsToKeep = new TreeSet(new Comparator() { + @Override + public int compare(PileupElement element1, PileupElement element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + for ( int i = 0; i < 4; i++ ) { + final ArrayList alleleList = alleleStratifiedElements[i]; + if ( alleleList.size() > numReadsToRemove ) + elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove)); + } + + // clean up pointers so memory can be garbage collected if needed + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i].clear(); + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); + } + public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { final List BAQedElements = new ArrayList(); for( final PileupElement PE : pileup ) { @@ -220,6 +257,22 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } } + private List downsampleElements(final ArrayList elements, final int numElementsToRemove) { + final int pileupSize = elements.size(); + final BitSet itemsToRemove = new BitSet(pileupSize); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + ArrayList elementsToKeep = new ArrayList(pileupSize - numElementsToRemove); + for ( int i = 0; i < pileupSize; i++ ) { + if ( !itemsToRemove.get(i) ) + elementsToKeep.add(elements.get(i)); + } + + return elementsToKeep; + } + private static class SampleGenotypeData { public final String name; From d44d5b827538789bae68c47c1634080e6c7d04c9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 21 Oct 2012 01:29:59 -0400 Subject: [PATCH 05/34] Fix RawHapMapCodec so that it can build indexes. Minor fixes to VCF codec. --- .../sting/utils/codecs/hapmap/RawHapMapCodec.java | 9 +++++++++ .../broadinstitute/sting/utils/codecs/vcf/VCFCodec.java | 4 +--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index 916fb43ea..6b3fce966 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -25,8 +25,11 @@ package org.broadinstitute.sting.utils.codecs.hapmap; import org.broad.tribble.AsciiFeatureCodec; +import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.annotation.Strand; +import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; +import org.broad.tribble.readers.PositionalBufferedStream; import java.io.IOException; import java.util.Arrays; @@ -116,4 +119,10 @@ public class RawHapMapCodec extends AsciiFeatureCodec { } return headerLine; } + + @Override + public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException { + final AsciiLineReader br = new AsciiLineReader(stream); + return new FeatureCodecHeader(readHeader(br), br.getPosition()); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 4df1efee7..f12f13dc7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.*; import java.io.IOException; import java.util.*; @@ -119,7 +117,7 @@ public class VCFCodec extends AbstractVCFCodec { // empty set for passes filters List fFields = new LinkedList(); // otherwise we have to parse and cache the value - if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) fFields.add(filterString); else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); From 0616b98551e7b3dbfa571ff93c65c9a1ef645029 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 21 Oct 2012 08:26:26 -0400 Subject: [PATCH 06/34] Not sure why we were setting the UAC variables instead of the simpleUAC ones when that's what we wanted. --- .../gatk/walkers/haplotypecaller/HaplotypeCaller.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 6d6351fc5..a08614deb 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -242,13 +242,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); - UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); From 67b9e7319e9be26d49ae1d75ea68aedbb0a0cfd0 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sun, 21 Oct 2012 12:38:33 -0400 Subject: [PATCH 07/34] Fix for integration tests: new criterion in AF exact calculation model to trim alleles based on likelihoods does produce better results and resulting alleles changed in 2 sites at integration tests (and all subsequent sites after this had minor annotation differences due to RankSum dithering) --- .../UnifiedGenotyperGeneralPloidyIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 989f06ec5..652489a71 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -70,12 +70,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9acfe0019efdc91217ee070acb071228"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","06a512271631c5b511314a2618de82d7"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","c1d4dd793f61710a1b1fc5d82803210f"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","36a383adfdbf1f59656138b538a9920d"); } @Test(enabled = true) From 9c63cee9fcdb69a7a8e8d77a771ddb2afa18f7cd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 18:36:14 -0400 Subject: [PATCH 08/34] Moving pnrm to UnifiedArgumentCollection so it's available with the HaplotypeCaller --- .../arguments/StandardCallerArgumentCollection.java | 10 +++++++++- .../walkers/genotyper/UnifiedArgumentCollection.java | 11 +++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 9b9f04228..a511364f9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.arguments; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; @@ -82,7 +83,6 @@ public class StandardCallerArgumentCollection { @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; - public StandardCallerArgumentCollection() { } // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! @@ -97,5 +97,13 @@ public class StandardCallerArgumentCollection { this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE; this.exactCallsLog = SCAC.exactCallsLog; + this.AFmodel = SCAC.AFmodel; } + + /** + * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. + */ + @Advanced + @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) + public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 17137c5e9..abf0b4420 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +<<<<<<< HEAD import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.pairhmm.PairHMM; +======= +>>>>>>> 19181ee... Moving pnrm to UnifiedArgumentCollection so it's available with the HaplotypeCaller import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -38,13 +41,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - /** - * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. - */ - @Advanced - @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; - /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily * distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it @@ -219,7 +215,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; this.pairHMM = uac.pairHMM; - // todo- arguments to remove this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } From 99c9031cb4c20cf996202329ca17978ea1fde59e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 20:41:33 -0400 Subject: [PATCH 09/34] Merge AFCalcResultTracker into StateTracker, cleanup -- These two classes were really the same, and now they are actually the same! -- Cleanuped the interfaces, removed duplicate data -- Added lots of contracts, some of which found numerical issues with GeneralPloidyExactAFCalc (which have been patched over but not fixed) -- Moved goodProbability and goodProbabilityVector utilities to MathUtils. Very useful for contracts! --- .../afcalc/GeneralPloidyExactAFCalc.java | 320 ++++++------------ ...neralPloidyAFCalculationModelUnitTest.java | 9 +- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 30 +- .../genotyper/afcalc/AFCalcResult.java | 55 +-- .../genotyper/afcalc/AFCalcResultTracker.java | 256 -------------- .../genotyper/afcalc/DiploidExactAFCalc.java | 55 ++- .../afcalc/OriginalDiploidExactAFCalc.java | 4 - .../genotyper/afcalc/StateTracker.java | 304 ++++++++++++++--- .../broadinstitute/sting/utils/MathUtils.java | 33 ++ 9 files changed, 440 insertions(+), 626 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 0e97c090c..3916c2549 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; -import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -69,8 +68,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { @Override public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker()); - return resultFromTracker(vc, log10AlleleFrequencyPriors); + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } /** @@ -171,13 +170,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alternate alleles * @param ploidyPerPool Number of samples per pool * @param log10AlleleFrequencyPriors Frequency priors - * @param resultTracker object to fill with output values */ - protected static void combineSinglePools(final GenotypesContext GLs, - final int numAlleles, - final int ploidyPerPool, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + protected void combineSinglePools(final GenotypesContext GLs, + final int numAlleles, + final int ploidyPerPool, + final double[] log10AlleleFrequencyPriors) { final ArrayList genotypeLikelihoods = getGLs(GLs, true); @@ -196,24 +193,24 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { if ( genotypeLikelihoods.size() <= 1 ) { // no meaningful GLs at all, just set the tracker to non poly values - resultTracker.reset(); // just mimic-ing call below - resultTracker.setLog10LikelihoodOfAFzero(0.0); + getStateTracker().reset(); // just mimic-ing call below + getStateTracker().setLog10LikelihoodOfAFzero(0.0); } else { for (int p=1; p ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects final HashMap indexesToACset = new HashMap(); @@ -230,16 +227,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { indexesToACset.put(zeroSet.getACcounts(), zeroSet); // keep processing while we have AC conformations that need to be calculated - StateTracker stateTracker = new StateTracker(); while ( !ACqueue.isEmpty() ) { - resultTracker.incNEvaluations(); + getStateTracker().incNEvaluations(); // compute log10Likelihoods final ExactACset ACset = ACqueue.remove(); - final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset); - - // adjust max likelihood seen if needed - if ( log10LofKs > stateTracker.getMaxLog10L()) - stateTracker.update(log10LofKs, ACset.getACcounts()); + final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); // clean up memory indexesToACset.remove(ACset.getACcounts()); @@ -260,39 +252,32 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param log10AlleleFrequencyPriors Prior object * @param originalPloidy Total ploidy of original combined pool * @param newGLPloidy Ploidy of GL vector - * @param resultTracker AFResult object - * @param stateTracker max likelihood observed so far * @param ACqueue Queue of conformations to compute * @param indexesToACset AC indices of objects in queue * @return max log likelihood */ - private static double calculateACConformationAndUpdateQueue(final ExactACset set, - final CombinedPoolLikelihoods newPool, - final CombinedPoolLikelihoods originalPool, - final double[] newGL, - final double[] log10AlleleFrequencyPriors, - final int originalPloidy, - final int newGLPloidy, - final AFCalcResultTracker resultTracker, - final StateTracker stateTracker, - final LinkedList ACqueue, - final HashMap indexesToACset) { + private double calculateACConformationAndUpdateQueue(final ExactACset set, + final CombinedPoolLikelihoods newPool, + final CombinedPoolLikelihoods originalPool, + final double[] newGL, + final double[] log10AlleleFrequencyPriors, + final int originalPloidy, + final int newGLPloidy, + final LinkedList ACqueue, + final HashMap indexesToACset) { // compute likeihood in "set" of new set based on original likelihoods final int numAlleles = set.getACcounts().getCounts().length; final int newPloidy = set.getACsum(); - final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker); + final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy); // add to new pool if (!Double.isInfinite(log10LofK)) newPool.add(set); - // TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) - //if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) { - if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - if ( VERBOSE ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L()); + // TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) + if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) { return log10LofK; } @@ -323,67 +308,67 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { } - /** - * Naive combiner of two multiallelic pools - number of alt alleles must be the same. - * Math is generalization of biallelic combiner. - * - * For vector K representing an allele count conformation, - * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) - * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) - * @param originalPool First log-likelihood pool GL vector - * @param yy Second pool GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param numAlleles Number of alleles - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - */ - public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { -/* - final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); - final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); - - if (dim1 != originalPool.getLength() || dim2 != yy.length) - throw new ReviewedStingException("BUG: Inconsistent vector length"); - - if (ploidy2 == 0) - return; - - final int newPloidy = ploidy1 + ploidy2; - - // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) - // and L2(K) = Pr(D|AC2=K) * choose(m2,K) - GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); - final double[] x = originalPool.getLikelihoodsAsVector(true); - while(firstIterator.hasNext()) { - x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); - firstIterator.next(); - } - - GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); - final double[] y = yy.clone(); - while(secondIterator.hasNext()) { - y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); - secondIterator.next(); - } - - // initialize output to -log10(choose(m1+m2,[k1 k2...]) - final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); - final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); - - - // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K - while(outputIterator.hasNext()) { - final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); - double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); - - originalPool.add(likelihood, set, outputIterator.getLinearIndex()); - outputIterator.next(); - } -*/ - } +// /** +// * Naive combiner of two multiallelic pools - number of alt alleles must be the same. +// * Math is generalization of biallelic combiner. +// * +// * For vector K representing an allele count conformation, +// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) +// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) +// * @param originalPool First log-likelihood pool GL vector +// * @param yy Second pool GL vector +// * @param ploidy1 Ploidy of first pool (# of chromosomes in it) +// * @param ploidy2 Ploidy of second pool +// * @param numAlleles Number of alleles +// * @param log10AlleleFrequencyPriors Array of biallelic priors +// * @param resultTracker Af calculation result object +// */ +// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, +// final double[] log10AlleleFrequencyPriors, +// final AFCalcResultTracker resultTracker) { +///* +// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); +// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); +// +// if (dim1 != originalPool.getLength() || dim2 != yy.length) +// throw new ReviewedStingException("BUG: Inconsistent vector length"); +// +// if (ploidy2 == 0) +// return; +// +// final int newPloidy = ploidy1 + ploidy2; +// +// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) +// // and L2(K) = Pr(D|AC2=K) * choose(m2,K) +// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); +// final double[] x = originalPool.getLikelihoodsAsVector(true); +// while(firstIterator.hasNext()) { +// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); +// firstIterator.next(); +// } +// +// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); +// final double[] y = yy.clone(); +// while(secondIterator.hasNext()) { +// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); +// secondIterator.next(); +// } +// +// // initialize output to -log10(choose(m1+m2,[k1 k2...]) +// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); +// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); +// +// +// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K +// while(outputIterator.hasNext()) { +// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); +// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); +// +// originalPool.add(likelihood, set, outputIterator.getLinearIndex()); +// outputIterator.next(); +// } +//*/ +// } /** * Compute likelihood of a particular AC conformation and update AFresult object @@ -394,15 +379,13 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alleles (including ref) * @param ploidy1 Ploidy of original pool (combined) * @param ploidy2 Ploidy of new pool - * @param resultTracker AFResult object * @return log-likehood of requested conformation */ - private static double computeLofK(final ExactACset set, - final CombinedPoolLikelihoods firstGLs, - final double[] secondGL, - final double[] log10AlleleFrequencyPriors, - final int numAlleles, final int ploidy1, final int ploidy2, - final AFCalcResultTracker resultTracker) { + private double computeLofK(final ExactACset set, + final CombinedPoolLikelihoods firstGLs, + final double[] secondGL, + final double[] log10AlleleFrequencyPriors, + final int numAlleles, final int ploidy1, final int ploidy2) { final int newPloidy = ploidy1 + ploidy2; @@ -420,8 +403,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; set.getLog10Likelihoods()[0] = log10Lof0; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return log10Lof0; } else { @@ -464,14 +447,14 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - resultTracker.updateMLEifNeeded(log10LofK, altCounts); + getStateTracker().updateMLEifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, altCounts); + getStateTracker().updateMAPifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); return log10LofK; } @@ -494,99 +477,6 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { return (sum == ploidy); } - /** - * Combines naively two biallelic pools (of arbitrary size). - * For two pools of size m1 and m2, we can compute the combined likelihood as: - * Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k) - * @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i - * @param newPLVector Second GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - * @return Combined likelihood vector - */ - public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector, - final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { - - final int newPloidy = ploidy1 + ploidy2; - - final double[] combinedLikelihoods = new double[1+newPloidy]; - - /** Pre-fill result array and incorporate weights into input vectors - * Say L1(k) = Pr(D|AC1=k) * choose(m1,k) - * and L2(k) = Pr(D|AC2=k) * choose(m2,k) - * equation reduces to - * Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j) - * which is just plain convolution of L1 and L2 (with pre-existing vector) - */ - - // intialize result vector to -infinity - Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY); - - final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1); - for (int k=originalPool.getProbabilityVector().length; k< x.length; k++) - x[k] = Double.NEGATIVE_INFINITY; - - final double[] y = newPLVector.clone(); - - - final double log10Lof0 = x[0]+y[0]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - - double maxElement = log10Lof0; - int maxElementIdx = 0; - int[] alleleCounts = new int[1]; - for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) { - double[] acc = new double[k+1]; - Arrays.fill(acc,Double.NEGATIVE_INFINITY); - double innerMax = Double.NEGATIVE_INFINITY; - - for (int j=0; j <=k; j++) { - double x1,y1; - - - if (k-j>=0 && k-j < y.length) - y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j); - else - continue; - - if (j < x.length) - x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j); - else - continue; - - if (Double.isInfinite(x1) || Double.isInfinite(y1)) - continue; - acc[j] = x1 + y1; - if (acc[j] > innerMax) - innerMax = acc[j]; - else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY) - break; - } - combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k); - maxElementIdx = k; - double maxDiff = combinedLikelihoods[k] - maxElement; - if (maxDiff > 0) - maxElement = combinedLikelihoods[k]; - else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) { - break; - } - - alleleCounts[0] = k; - resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts); - resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts); - - - } - - - return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true)); - } - - /** * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, * including updating the PL's, and assign genotypes accordingly @@ -675,10 +565,10 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * * @return genotype */ - private static void assignGenotype(final GenotypeBuilder gb, - final double[] newLikelihoods, - final List allelesToUse, - final int numChromosomes) { + private void assignGenotype(final GenotypeBuilder gb, + final double[] newLikelihoods, + final List allelesToUse, + final int numChromosomes) { final int numNewAltAlleles = allelesToUse.size() - 1; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java index 48f282901..1b3a4c0c0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java @@ -137,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - - final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles); final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker); + final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, 1 + cfg.numAltAlleles, cfg.ploidy); + calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele]; - -// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount); + int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele]; Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index 07f88c9e3..927fadd94 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -50,7 +50,7 @@ public abstract class AFCalc implements Cloneable { protected Logger logger = defaultLogger; private SimpleTimer callTimer = new SimpleTimer(); - private final AFCalcResultTracker resultTracker; + private final StateTracker stateTracker; private ExactCallLogger exactCallLogger = null; protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { @@ -62,7 +62,7 @@ public abstract class AFCalc implements Cloneable { this.nSamples = nSamples; this.maxAlternateAllelesToGenotype = maxAltAlleles; this.maxAlternateAllelesForIndels = maxAltAllelesForIndels; - this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); + this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); } public void enableProcessLog(final File exactCallsLog) { @@ -83,10 +83,10 @@ public abstract class AFCalc implements Cloneable { public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); - if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); + if ( stateTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); // reset the result, so we can store our new result there - resultTracker.reset(); + stateTracker.reset(); final VariantContext vcWorking = reduceScope(vc); @@ -100,10 +100,20 @@ public abstract class AFCalc implements Cloneable { return result; } - @Deprecated - protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { - resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); - return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors); + /** + * Convert the final state of the state tracker into our result as an AFCalcResult + * + * Assumes that stateTracker has been updated accordingly + * + * @param vcWorking the VariantContext we actually used as input to the calc model (after reduction) + * @param log10AlleleFrequencyPriors the priors by AC vector + * @return a AFCalcResult describing the result of this calculation + */ + @Requires("stateTracker.getnEvaluations() > 0") + @Ensures("result != null") + protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { + stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); + return stateTracker.toAFCalcResult(log10AlleleFrequencyPriors); } // --------------------------------------------------------------------------- @@ -162,8 +172,8 @@ public abstract class AFCalc implements Cloneable { return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); } - public AFCalcResultTracker getResultTracker() { - return resultTracker; + public StateTracker getStateTracker() { + return stateTracker; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index 7cacb2060..a65772444 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -83,8 +83,8 @@ public class AFCalcResult { if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); - if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); - if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); this.alleleCountsOfMLE = alleleCountsOfMLE; this.nEvaluations = nEvaluations; @@ -147,7 +147,7 @@ public class AFCalcResult { * Due to computational / implementation constraints this may be smaller than * the actual list of alleles requested * - * @return a non-empty list of alleles used during genotyping + * @return a non-empty list of alleles used during genotyping, the first of which is the reference allele */ @Ensures({"result != null", "! result.isEmpty()"}) public List getAllelesUsedInGenotyping() { @@ -159,7 +159,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFEq0() { return log10PosteriorsOfAC[AF0]; } @@ -169,7 +169,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFGT0() { return log10PosteriorsOfAC[AF1p]; } @@ -179,7 +179,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFEq0() { return log10LikelihoodsOfAC[AF0]; } @@ -189,7 +189,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFGT0() { return log10LikelihoodsOfAC[AF1p]; } @@ -199,7 +199,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFEq0() { return log10PriorsOfAC[AF0]; } @@ -209,7 +209,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFGT0() { return log10PriorsOfAC[AF1p]; } @@ -263,7 +263,7 @@ public class AFCalcResult { * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping * @return the log10 probability that allele is segregating at this site */ - @Ensures("goodLog10Probability(result)") + @Ensures("MathUtils.goodLog10Probability(result)") public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { final Double log10pNonRef = log10pNonRefByAllele.get(allele); if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); @@ -279,7 +279,7 @@ public class AFCalcResult { * @return freshly allocated log10 normalized posteriors vector */ @Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length") - @Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") + @Ensures("MathUtils.goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) { final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length]; for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ ) @@ -287,29 +287,6 @@ public class AFCalcResult { return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); } - /** - * Check that the log10 prob vector vector is well formed - * - * @param vector - * @param expectedSize - * @param shouldSumToOne - * - * @return true if vector is well-formed, false otherwise - */ - private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { - if ( vector.length != expectedSize ) return false; - - for ( final double pr : vector ) { - if ( ! goodLog10Probability(pr) ) - return false; - } - - if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 ) - return false; - - return true; // everything is good - } - /** * Computes the offset into linear vectors indexed by alt allele for allele * @@ -331,14 +308,4 @@ public class AFCalcResult { else return index - 1; } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value - * @return true if result is really well formed - */ - private static boolean goodLog10Probability(final double result) { - return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java deleted file mode 100644 index 5c926a4d8..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import com.google.java.contract.Ensures; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: ebanks - * Date: Dec 14, 2011 - * - * Useful helper class to communicate the results of the allele frequency calculation - * - * TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF? - */ -class AFCalcResultTracker { - protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - - // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles - protected double log10MLE; - protected double log10MAP; - private final int[] alleleCountsOfMLE; - private final int[] alleleCountsOfMAP; - - // The posteriors seen, not including that of AF=0 - private static final int LIKELIHOODS_CACHE_SIZE = 5000; - private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE]; - private int currentLikelihoodsCacheIndex = 0; - protected Double log10LikelihoodsMatrixSum = null; - - // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - private double log10LikelihoodOfAFzero; - private double log10PosteriorOfAFzero; - private int[] AClimits; - - int nEvaluations = 0; - - /** - * The list of alleles actually used in computing the AF - */ - private List allelesUsedInGenotyping = null; - - /** - * Create a results object capability of storing results for calls with up to maxAltAlleles - * - * @param maxAltAlleles an integer >= 1 - */ - public AFCalcResultTracker(final int maxAltAlleles) { - if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); - - alleleCountsOfMLE = new int[maxAltAlleles]; - alleleCountsOfMAP = new int[maxAltAlleles]; - - reset(); - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MLE - * - * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, - * starting from index 0 (i.e., the first alt allele is at 0). The vector is always - * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values - * are meaningful. - * - * @return a vector with allele counts, not all of which may be meaningful - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMLE() { - return alleleCountsOfMLE; - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MAP - * - * @see #getAlleleCountsOfMLE() for the encoding of results in this vector - * - * @return a non-null vector of ints - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMAP() { - return alleleCountsOfMAP; - } - - /** - * Returns the likelihoods summed across all AC values for AC > 0 - * - * @return - */ - public double getLog10LikelihoodOfAFNotZero() { - if ( log10LikelihoodsMatrixSum == null ) { - if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have - log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO; - else - log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - } - return log10LikelihoodsMatrixSum; - } - - public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { - return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY); - } - - /** - * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should - * - * @return - */ - public double getLog10LikelihoodOfAFzero() { - return log10LikelihoodOfAFzero; - } - - /** - * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should - * - * @return - */ - public double getLog10PosteriorOfAFzero() { - return log10PosteriorOfAFzero; - } - - protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { - final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); - final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; - final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - - // TODO -- replace with more meaningful computation - // TODO -- refactor this calculation into the ref calculation - final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); - for ( int i = 0; i < subACOfMLE.length; i++ ) { - final Allele allele = allelesUsedInGenotyping.get(i+1); - final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was - log10pNonRefByAllele.put(allele, log10PNonRef); - } - - return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); - } - - // -------------------------------------------------------------------------------- - // - // Protected mutational methods only for use within the calculation models themselves - // - // -------------------------------------------------------------------------------- - - /** - * Reset the data in this results object, so that it can be used in a subsequent AF calculation - * - * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer - */ - protected void reset() { - log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED; - for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { - alleleCountsOfMLE[i] = 0; - alleleCountsOfMAP[i] = 0; - } - currentLikelihoodsCacheIndex = 0; - log10LikelihoodsMatrixSum = null; - allelesUsedInGenotyping = null; - nEvaluations = 0; - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - } - - /** - * Tell this result we used one more evaluation cycle - */ - protected void incNEvaluations() { - nEvaluations++; - } - - protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { - addToLikelihoodsCache(log10LofK); - - if ( log10LofK > log10MLE ) { - log10MLE = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMLE[i] = alleleCountsForK[i]; - } - } - - protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { - if ( log10LofK > log10MAP ) { - log10MAP = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMAP[i] = alleleCountsForK[i]; - } - } - - private void addToLikelihoodsCache(final double log10LofK) { - // add to the cache - log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK; - - // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell - if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) { - final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - log10LikelihoodsMatrixValues[0] = temporarySum; - currentLikelihoodsCacheIndex = 1; - } - } - - protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { - this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; - if ( log10LikelihoodOfAFzero > log10MLE ) { - log10MLE = log10LikelihoodOfAFzero; - Arrays.fill(alleleCountsOfMLE, 0); - } - } - - protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { - this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; - if ( log10PosteriorOfAFzero > log10MAP ) { - log10MAP = log10PosteriorOfAFzero; - Arrays.fill(alleleCountsOfMAP, 0); - } - } - - protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { - if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) - throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); - - this.allelesUsedInGenotyping = allelesUsedInGenotyping; - } - - protected void setAClimits(int[] AClimits) { - this.AClimits = AClimits; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 49915c515..6b345dcf5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -36,10 +36,6 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); } - protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { - return new StateTracker(); - } - @Override protected AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { @@ -60,29 +56,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACqueue.add(zeroSet); indexesToACset.put(zeroSet.getACcounts(), zeroSet); - // keep processing while we have AC conformations that need to be calculated - final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker()); - while ( !ACqueue.isEmpty() ) { - getResultTracker().incNEvaluations(); // keep track of the number of evaluations + getStateTracker().incNEvaluations(); // keep track of the number of evaluations // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - if ( stateTracker.withinMaxACs(set.getACcounts()) ) { - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker()); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); - // adjust max likelihood seen if needed - stateTracker.update(log10LofKs, set.getACcounts()); - - // clean up memory - indexesToACset.remove(set.getACcounts()); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s%n", set.ACcounts); - } + // clean up memory + indexesToACset.remove(set.getACcounts()); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); } - return resultFromTracker(vc, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } @Override @@ -153,23 +141,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private double calculateAlleleCountConformation(final ExactACset set, final ArrayList genotypeLikelihoods, - final StateTracker stateTracker, final int numChr, final LinkedList ACqueue, final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { //if ( DEBUG ) // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker); + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // can we abort early because the log10Likelihoods are so small? - if ( stateTracker.abort(log10LofK, set.getACcounts()) ) { + if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { //if ( DEBUG ) // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); return log10LofK; @@ -188,7 +174,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACcountsClone[allele]++; // to get to this conformation, a sample would need to be AB (remember that ref=0) final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -213,9 +199,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering for ( DependentSet dependent : differentAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); for ( DependentSet dependent : sameAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } return log10LofK; @@ -223,8 +209,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // also pushes its value to the given callingSetIndex. - private void updateACset(final StateTracker stateTracker, - final int[] newSetCounts, + private void updateACset(final int[] newSetCounts, final int numChr, final ExactACset dependentSet, final int PLsetIndex, @@ -246,8 +231,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { set.getLog10Likelihoods()[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -258,8 +242,8 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return; } @@ -281,14 +265,15 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // update the MLE if necessary - resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); + getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); // apply the priors over each alternate allele for ( final int ACcount : set.getACcounts().getCounts() ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); + + getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); } private void pushData(final ExactACset targetSet, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index 093bf47d5..88f5e06e6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -16,10 +16,6 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); } - protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { - return new StateTracker(); - } - @Override protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 19e253277..3eb32d35e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -1,35 +1,85 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + /** - * Keeps track of the best state seen by the exact model and the max states to visit - * allowing us to abort the search before we visit the entire matrix of AC x samples + * Keeps track of the state information during the exact model AF calculation. + * + * Tracks things like the MLE and MAP AC values, their corresponding likelhood and posterior + * values, the likelihood of the AF == 0 state, and the number of evaluations needed + * by the calculation to compute the P(AF == 0) */ final class StateTracker { - public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - - final private int[] maxACsToConsider; - - private ExactACcounts ACsAtMax = null; - private double maxLog10L = Double.NEGATIVE_INFINITY; - - public StateTracker() { - this(null); - } - - public StateTracker(final int[] maxACsToConsider) { - this.maxACsToConsider = maxACsToConsider; - } + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; + protected final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 /** - * Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state - * - * @param log10LofKs the likelihood of our current configuration state + * These variables are intended to contain the MLE and MAP (and their corresponding allele counts) + * of the site over all alternate alleles */ - public void update(final double log10LofKs, final ExactACcounts ACs) { - if ( log10LofKs > getMaxLog10L()) { - this.setMaxLog10L(log10LofKs); - this.ACsAtMax = ACs; - } + protected double log10MLE; + protected double log10MAP; + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. + */ + private final int[] alleleCountsOfMLE; + private final int[] alleleCountsOfMAP; + + /** + * A vector of log10 likelihood values seen, for future summation. When the size of the + * vector is exceeed -- because we've pushed more posteriors than there's space to hold + * -- we simply sum up the existing values, make that the first value, and continue. + */ + private final double[] log10LikelihoodsForAFGt0 = new double[LIKELIHOODS_CACHE_SIZE]; + private static final int LIKELIHOODS_CACHE_SIZE = 5000; + private int log10LikelihoodsForAFGt0CacheIndex = 0; + + /** + * The actual sum of the likelihoods. Null if the sum hasn't been computed yet + */ + protected Double log10LikelihoodsForAFGt0Sum = null; + + /** + * Contains the likelihood for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + */ + private double log10LikelihoodOfAFzero = 0.0; + + /** + * The number of evaluates we've gone through in the AFCalc + */ + private int nEvaluations = 0; + + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capability of storing results for calls with up to maxAltAlleles + * + * @param maxAltAlleles an integer >= 1 + */ + public StateTracker(final int maxAltAlleles) { + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); + + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + + reset(); } /** @@ -39,58 +89,200 @@ final class StateTracker { * @param log10LofK the log10 likelihood of the configuration we're considering analyzing * @return true if the configuration cannot meaningfully contribute to our likelihood sum */ - public boolean tooLowLikelihood(final double log10LofK) { - return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY; + private boolean tooLowLikelihood(final double log10LofK) { + return log10LofK < log10MLE - MAX_LOG10_ERROR_TO_STOP_EARLY; } /** - * Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider? + * @return true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + */ + private boolean isLowerAC(final ExactACcounts otherACs) { + final int[] otherACcounts = otherACs.getCounts(); + + for ( int i = 0; i < otherACcounts.length; i++ ) { + if ( alleleCountsOfMLE[i] > otherACcounts[i] ) + return false; + } + + return true; + } + + /** + * Should we stop exploring paths from ACs, given it's log10LofK * - * @param otherACs the set of otherACs that we want to know if we should consider analyzing - * @return true if otherACs is a state worth considering, or false otherwise + * @param log10LofK the log10LofK of these ACs + * @param ACs the ACs of this state + * @return return true if there's no reason to continue with subpaths of AC, or false otherwise */ - public boolean withinMaxACs(final ExactACcounts otherACs) { - if ( maxACsToConsider == null ) - return true; + protected boolean abort( final double log10LofK, final ExactACcounts ACs, final boolean enforceLowerACs ) { + return tooLowLikelihood(log10LofK) && (!enforceLowerACs || isLowerAC(ACs)); + } - final int[] otherACcounts = otherACs.getCounts(); + @Ensures("result != null") + protected int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; + } - for ( int i = 0; i < maxACsToConsider.length; i++ ) { - // consider one more than the max AC to collect a bit more likelihood mass - if ( otherACcounts[i] > maxACsToConsider[i] + 1 ) - return false; - } - - return true; + @Ensures("result >= 0") + protected int getnEvaluations() { + return nEvaluations; } /** - * returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + * Returns the likelihoods summed across all AC values for AC > 0 + * + * @return */ - public boolean isLowerAC(final ExactACcounts otherACs) { - if ( ACsAtMax == null ) - return true; + private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { + if ( log10LikelihoodsForAFGt0Sum == null ) { + if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have + log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO; + else + log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + } + return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 0.0 : Double.POSITIVE_INFINITY); + } - final int[] myACcounts = this.ACsAtMax.getCounts(); - final int[] otherACcounts = otherACs.getCounts(); + /** + * @return + */ + private double getLog10LikelihoodOfAFzero() { + return log10LikelihoodOfAFzero; + } - for ( int i = 0; i < myACcounts.length; i++ ) { - if ( myACcounts[i] > otherACcounts[i] ) - return false; + /** + * Convert this state to an corresponding AFCalcResult. + * + * Assumes that the values in this state have been filled in with meaningful values during the calculation. + * For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful + * values, etc. + * + * @param log10PriorsByAC + * + * @return + */ + @Requires("allelesUsedInGenotyping != null") + protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { + final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); + final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; + final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); + + // TODO -- replace with more meaningful computation + // TODO -- refactor this calculation into the ref calculation + final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + for ( int i = 0; i < subACOfMLE.length; i++ ) { + final Allele allele = allelesUsedInGenotyping.get(i+1); + final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was + log10pNonRefByAllele.put(allele, log10PNonRef); } - return true; + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); } - public boolean abort( final double log10LofK, final ExactACcounts ACs ) { - return tooLowLikelihood(log10LofK) && isLowerAC(ACs); + // -------------------------------------------------------------------------------- + // + // Protected mutational methods only for use within the calculation models themselves + // + // -------------------------------------------------------------------------------- + + /** + * Reset the data in this results object, so that it can be used in a subsequent AF calculation + * + * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer + */ + protected void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = VALUE_NOT_CALCULATED; + log10LikelihoodsForAFGt0CacheIndex = 0; + log10LikelihoodsForAFGt0Sum = null; + allelesUsedInGenotyping = null; + nEvaluations = 0; + Arrays.fill(alleleCountsOfMLE, 0); + Arrays.fill(alleleCountsOfMAP, 0); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); } - public double getMaxLog10L() { - return maxLog10L; + /** + * Tell this result we used one more evaluation cycle + */ + protected void incNEvaluations() { + nEvaluations++; } - public void setMaxLog10L(double maxLog10L) { - this.maxLog10L = maxLog10L; + /** + * Update the maximum log10 likelihoods seen, if log10LofKs is higher, and the corresponding ACs of this state + * + * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"}) + @Ensures("log10MLE == Math.max(log10LofK, log10MLE)") + protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + addToLikelihoodsCache(log10LofK); + + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length); + } + } + + /** + * Update the maximum log10 posterior seen, if log10PofKs is higher, and the corresponding ACs of this state + * + * @param log10PofK the posterior of our current configuration state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"}) + @Ensures("log10MAP == Math.max(log10PofK, log10MAP)") + protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) { + if ( log10PofK > log10MAP ) { + log10MAP = log10PofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length); + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LofK)"}) + private void addToLikelihoodsCache(final double log10LofK) { + // add to the cache + log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK; + + // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell + if ( log10LikelihoodsForAFGt0CacheIndex == LIKELIHOODS_CACHE_SIZE) { + final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); + log10LikelihoodsForAFGt0[0] = temporarySum; + log10LikelihoodsForAFGt0CacheIndex = 1; + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + Arrays.fill(alleleCountsOfMAP, 0); + } + } + + /** + * Set the list of alleles used in genotyping + * + * @param allelesUsedInGenotyping the list of alleles, where the first allele is reference + */ + @Requires({"allelesUsedInGenotyping != null", "allelesUsedInGenotyping.size() > 1"}) + protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) + throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); + if ( allelesUsedInGenotyping.get(0).isNonReference() ) + throw new IllegalArgumentException("The first element of allelesUsedInGenotyping must be the reference allele"); + + this.allelesUsedInGenotyping = allelesUsedInGenotyping; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 3740d5d7c..ff153a85c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1194,6 +1194,39 @@ public class MathUtils { return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result) { + return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); + } + /** * A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that From 695cf8367598d3cb312e590b2cb3fb6a0cd65faf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 18 Oct 2012 07:38:32 -0400 Subject: [PATCH 10/34] More docs and contracts for classes in genotyper.afcalc -- Future protection of the output of GeneralPloidyExactAFCalc, which produces in some cases bad likelihoods (positive values) --- .../afcalc/GeneralPloidyExactAFCalc.java | 6 ++-- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 28 +++++++++++++++++-- .../IndependentAllelesDiploidExactAFCalc.java | 14 ++++++++-- .../afcalc/OriginalDiploidExactAFCalc.java | 2 +- .../genotyper/afcalc/StateTracker.java | 12 +++----- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 3916c2549..9c7883ab8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -447,14 +447,16 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - getStateTracker().updateMLEifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMLEifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - getStateTracker().updateMAPifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMAPifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); return log10LofK; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index 927fadd94..e3abdeb24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -53,6 +53,16 @@ public abstract class AFCalc implements Cloneable { private final StateTracker stateTracker; private ExactCallLogger exactCallLogger = null; + /** + * Create a new AFCalc object capable of calculating the prob. that alleles are + * segregating among nSamples with up to maxAltAlleles for SNPs and maxAltAllelesForIndels + * for indels for samples with ploidy + * + * @param nSamples number of samples, must be > 0 + * @param maxAltAlleles maxAltAlleles for SNPs + * @param maxAltAllelesForIndels for indels + * @param ploidy the ploidy, must be > 0 + */ protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); @@ -65,10 +75,20 @@ public abstract class AFCalc implements Cloneable { this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); } + /** + * Enable exact call logging to file + * + * @param exactCallsLog the destination file + */ public void enableProcessLog(final File exactCallsLog) { exactCallLogger = new ExactCallLogger(exactCallsLog); } + /** + * Use this logger instead of the default logger + * + * @param logger + */ public void setLogger(Logger logger) { this.logger = logger; } @@ -109,7 +129,7 @@ public abstract class AFCalc implements Cloneable { * @param log10AlleleFrequencyPriors the priors by AC vector * @return a AFCalcResult describing the result of this calculation */ - @Requires("stateTracker.getnEvaluations() > 0") + @Requires("stateTracker.getnEvaluations() >= 0") @Ensures("result != null") protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); @@ -144,11 +164,13 @@ public abstract class AFCalc implements Cloneable { * @param log10AlleleFrequencyPriors priors * @return a AFCalcResult object describing the results of this calculation */ - // TODO -- add consistent requires among args + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors); /** + * Subset VC to the just allelesToUse, updating genotype likelihoods + * * Must be overridden by concrete subclasses * * @param vc variant context with alleles and genotype likelihoods @@ -172,7 +194,7 @@ public abstract class AFCalc implements Cloneable { return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); } - public StateTracker getStateTracker() { + protected StateTracker getStateTracker() { return stateTracker; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index c0edee291..2f85a5246 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -303,7 +303,13 @@ import java.util.*; /** * Take the independent estimates of pNonRef for each alt allele and combine them into a single result * - * TODO -- add more docs + * Given n independent calculations for each of n alternate alleles create a single + * combined AFCalcResult with: + * + * priors for AF == 0 equal to theta^N for the nth least likely allele + * posteriors that reflect the combined chance that any alleles are segregating and corresponding + * likelihoods + * combined MLEs in the order of the alt alleles in vc * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ @@ -350,8 +356,10 @@ import java.util.*; }; return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), - MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // necessary to ensure all values < 0 - MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized + // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), + // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), log10pNonRefByAllele, sortedResultsWithThetaNPriors); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index 88f5e06e6..dea38e46c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -11,7 +11,7 @@ import java.util.Map; /** * Original bi-allelic ~O(N) implementation. Kept here for posterity and reference */ -public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { +class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 3eb32d35e..301891a99 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -129,9 +129,7 @@ final class StateTracker { } /** - * Returns the likelihoods summed across all AC values for AC > 0 - * - * @return + * @return the likelihoods summed across all AC values for AC > 0 */ private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { if ( log10LikelihoodsForAFGt0Sum == null ) { @@ -144,7 +142,7 @@ final class StateTracker { } /** - * @return + * @return the log10 likelihood of AF == 0 */ private double getLog10LikelihoodOfAFzero() { return log10LikelihoodOfAFzero; @@ -157,9 +155,9 @@ final class StateTracker { * For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful * values, etc. * - * @param log10PriorsByAC + * @param log10PriorsByAC the priors by AC * - * @return + * @return an AFCalcResult summarizing the final results of this calculation */ @Requires("allelesUsedInGenotyping != null") protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { @@ -167,8 +165,6 @@ final class StateTracker { final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - // TODO -- replace with more meaningful computation - // TODO -- refactor this calculation into the ref calculation final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); for ( int i = 0; i < subACOfMLE.length; i++ ) { final Allele allele = allelesUsedInGenotyping.get(i+1); From 326f42927050ad326ec41a64f6d8c72c17ecc22f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 14:06:41 -0400 Subject: [PATCH 11/34] Bugfixes to make new AFCalc system pass integrationtests -- GeneralPloidyExactAFCalc turns -Infinity values into -Double.MAX_VALUE, so our calculations pass unit tests -- Bugfix for GeneralPloidyGenotypeLikelihoodsCalculationModel, return a null VC when the only allele we get from our final alleles to use method is the reference base -- Fix calculation of reference posteriors when P(AF == 0) = 0.0 and P(AF == 0) = X for some meaningful value of X. Added unit test to ensure this behavior is correct -- Fix horrible sorting bug in IndependentAllelesDiploidExactAFCalc that applied the theta^N priors in the wrong order. Add contract to ensure this doesn't ever happen again -- Bugfix in GLBasedSampleSelector, where VCs without any polymorphic alleles were being sent to the exact model -- --- ...dyGenotypeLikelihoodsCalculationModel.java | 2 +- .../afcalc/GeneralPloidyExactAFCalc.java | 4 +-- .../afcalc/AFCalcResultUnitTest.java | 5 ++++ .../genotyper/afcalc/AFCalcUnitTest.java | 2 +- .../IndependentAllelesDiploidExactAFCalc.java | 28 +++++++++++++++++-- .../GLBasedSampleSelector.java | 3 ++ 6 files changed, 37 insertions(+), 7 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index 4c20700ac..2522fc16e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -245,7 +245,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G // find the alternate allele(s) that we should be using final List alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs); - if (alleles == null || alleles.isEmpty()) + if (alleles == null || alleles.isEmpty() || (alleles.size() == 1 && alleles.get(0).isReference())) return null; // start making the VariantContext final GenomeLoc loc = ref.getLocus(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 9c7883ab8..51b7fb633 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -448,7 +448,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMLEifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); + getStateTracker().updateMLEifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { @@ -456,7 +456,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { log10LofK += log10AlleleFrequencyPriors[ACcount]; } // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMAPifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); + getStateTracker().updateMAPifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); return log10LofK; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index 1070642e9..cbe2eb268 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -56,6 +56,11 @@ public class AFCalcResultUnitTest extends BaseTest { tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})}); } + // test that a non-ref site gets reasonable posteriors with an ~0.0 value doesn't get lost + for ( final double nonRefL : Arrays.asList(-100.0, -50.0, -10.0, -9.0, -8.0, -7.0, -6.0, -5.0)) { + tests.add(new Object[]{new MyTest(new double[]{0.0, nonRefL}, new double[]{0.0, nonRefL})}); + } + return tests.toArray(new Object[][]{}); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 25df0f6d2..9c6c8e8ab 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -185,7 +185,7 @@ public class AFCalcUnitTest extends BaseTest { testResultSimple(cfg); } - @Test(enabled = true, dataProvider = "badGLs") + @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") public void testBadGLs(GetGLsTest cfg) { testResultSimple(cfg); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index 2f85a5246..ea89d3802 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -100,7 +100,7 @@ import java.util.*; private final static class CompareAFCalcResultsByPNonRef implements Comparator { @Override public int compare(AFCalcResult o1, AFCalcResult o2) { - return Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); + return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); } } @@ -313,6 +313,7 @@ import java.util.*; * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ + @Requires("sortedByPosteriorGT(sortedResultsWithThetaNPriors)") protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, final List sortedResultsWithThetaNPriors) { int nEvaluations = 0; @@ -321,8 +322,9 @@ import java.util.*; final double[] log10PriorsOfAC = new double[2]; final Map log10pNonRefByAllele = new HashMap(nAltAlleles); - // this value is a sum in log space + // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs double log10PosteriorOfACEq0Sum = 0.0; + double log10PosteriorOfACGt0Sum = 0.0; for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); @@ -337,6 +339,7 @@ import java.util.*; // the AF > 0 case requires us to store the normalized likelihood for later summation if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); + log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); @@ -348,7 +351,16 @@ import java.util.*; // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 - final double log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + // + // note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we + // use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where + // AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO + final double log10PosteriorOfACGt0; + if ( log10PosteriorOfACEq0Sum == 0.0 ) + log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum; + else + log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + final double[] log10LikelihoodsOfAC = new double[] { // L + prior = posterior => L = poster - prior log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], @@ -362,4 +374,14 @@ import java.util.*; MathUtils.normalizeFromLog10(log10PriorsOfAC, true), log10pNonRefByAllele, sortedResultsWithThetaNPriors); } + + private static boolean sortedByPosteriorGT(final List sortedVCs) { + double lastPosteriorGt0 = sortedVCs.get(0).getLog10PosteriorOfAFGT0(); + for ( final AFCalcResult vc : sortedVCs ) { + if ( vc.getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + return false; + lastPosteriorGt0 = vc.getLog10PosteriorOfAFGT0(); + } + return true; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index f8c871e7d..48a2d2700 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -48,6 +48,9 @@ public class GLBasedSampleSelector extends SampleSelector { // first subset to the samples VariantContext subContext = vc.subContextFromSamples(samples); + if ( ! subContext.isPolymorphicInSamples() ) + return false; + // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? if ( flatPriors == null ) { From eaffb814d381e19aa8d522338f726c481634ab98 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 18 Oct 2012 13:34:51 -0400 Subject: [PATCH 12/34] IndependentExactAFCalc is now the default EXACT model implementation -- Changed UG / HC to use this one via the StandardCallerArgumentCollection -- Update the AFCalcFactory.Calculation to have a getDefault() value instead of having a duplicate entry in the enums --- .../gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java | 2 +- .../arguments/StandardCallerArgumentCollection.java | 2 +- .../gatk/walkers/genotyper/afcalc/AFCalcFactory.java | 11 +++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 9c6c8e8ab..ab967fbe1 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -495,7 +495,7 @@ public class AFCalcUnitTest extends BaseTest { // list of all high-quality models in the system final List models = Arrays.asList( - AFCalcFactory.Calculation.EXACT, + AFCalcFactory.Calculation.getDefaultModel(), AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index a511364f9..84dfa694b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -105,5 +105,5 @@ public class StandardCallerArgumentCollection { */ @Advanced @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; + public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index 7d67815cf..80de555ca 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -24,15 +24,12 @@ public class AFCalcFactory { * the needs of the request (i.e., considering ploidy). */ public enum Calculation { - /** The default implementation */ - EXACT(ReferenceDiploidExactAFCalc.class, 2, -1), - - /** reference implementation of multi-allelic EXACT model */ - EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), - /** expt. implementation -- for testing only */ EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1), + /** reference implementation of multi-allelic EXACT model. Extremely slow for many alternate alleles */ + EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), + /** original biallelic exact model, for testing only */ EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2), @@ -60,6 +57,8 @@ public class AFCalcFactory { return (requiredPloidy == -1 || requiredPloidy == requestedPloidy) && (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles); } + + public static Calculation getDefaultModel() { return EXACT_INDEPENDENT; } } private static final Map> afClasses; From 0fb82745077af0d1795253a1b3c2e5c03551645d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 17:11:31 -0400 Subject: [PATCH 13/34] Fix contact on sorting of AFCalcResults -- The thing that must be sorted is the pre-theta^N list, which is not checked in the routine that applies the theta^N prior. --- .../IndependentAllelesDiploidExactAFCalc.java | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index ea89d3802..804b560b4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -285,10 +285,14 @@ import java.util.*; // sort the results, so the most likely allele is first Collections.sort(sorted, compareAFCalcResultsByPNonRef); + double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); for ( int i = 0; i < sorted.size(); i++ ) { - final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; + if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); + + final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; @@ -313,7 +317,6 @@ import java.util.*; * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ - @Requires("sortedByPosteriorGT(sortedResultsWithThetaNPriors)") protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, final List sortedResultsWithThetaNPriors) { int nEvaluations = 0; @@ -374,14 +377,4 @@ import java.util.*; MathUtils.normalizeFromLog10(log10PriorsOfAC, true), log10pNonRefByAllele, sortedResultsWithThetaNPriors); } - - private static boolean sortedByPosteriorGT(final List sortedVCs) { - double lastPosteriorGt0 = sortedVCs.get(0).getLog10PosteriorOfAFGT0(); - for ( final AFCalcResult vc : sortedVCs ) { - if ( vc.getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) - return false; - lastPosteriorGt0 = vc.getLog10PosteriorOfAFGT0(); - } - return true; - } } From 0fcd358ace4c232236e577d53627ea06959b4766 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 19:33:47 -0400 Subject: [PATCH 14/34] Original EXACT model implementation lives, providing another reference (bi-allelic only) EXACT model -- Potentially a very fast implementation (it's very clean) but restricted to the biallelic case -- A starting point for future bi-allelic only optimized (logless) or generalized (bi-allelic general ploidy) implementations -- Added systematic unit tests covering this implementation, and comparing it to others -- Uncovered a nasty normalization bug in StateTracker that was capping our likelihoods at 0, even after summing up multiple likelihoods, which is just not safe to do and was causing us to lose likelihood in some cases -- Removed the restriction that a likelihood be <= 0 in StateTracker, and the protection for these cases in GeneralPloidyExactAFCalc which just wasn't right --- .../afcalc/GeneralPloidyExactAFCalc.java | 4 +- .../genotyper/afcalc/AFCalcUnitTest.java | 149 ++++++++++++++---- .../afcalc/OriginalDiploidExactAFCalc.java | 38 +++-- .../genotyper/afcalc/StateTracker.java | 14 +- 4 files changed, 149 insertions(+), 56 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 51b7fb633..2b247430c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -448,7 +448,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMLEifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); + getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { @@ -456,7 +456,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { log10LofK += log10AlleleFrequencyPriors[ACcount]; } // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMAPifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); + getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); return log10LofK; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index ab967fbe1..ef4318a40 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; @@ -124,12 +125,7 @@ public class AFCalcUnitTest extends BaseTest { final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2, 2); final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors @@ -146,7 +142,7 @@ public class AFCalcUnitTest extends BaseTest { new GetGLsTest(model, 1, genotypes, priors, priorName); // tri-allelic - if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) ) + if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) ) // || model != generalCalc ) ) for ( List genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) ) new GetGLsTest(model, 2, genotypes, priors, priorName); } @@ -156,22 +152,28 @@ public class AFCalcUnitTest extends BaseTest { return GetGLsTest.getTests(GetGLsTest.class); } - @DataProvider(name = "badGLs") - public Object[][] createBadGLs() { - final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); - final int nSamples = genotypes.size(); +// @DataProvider(name = "badGLs") +// public Object[][] createBadGLs() { +// final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); +// final int nSamples = genotypes.size(); +// +// final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); +// +// final int nPriorValues = 2*nSamples+1; +// final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors +// for ( AFCalc model : Arrays.asList(indCalc) ) { +// final String priorName = "flat"; +// new GetGLsTest(model, 2, genotypes, priors, priorName); +// } +// +// return GetGLsTest.getTests(GetGLsTest.class); +// } - final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); - - final int nPriorValues = 2*nSamples+1; - final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors - for ( AFCalc model : Arrays.asList(indCalc) ) { - final String priorName = "flat"; - new GetGLsTest(model, 2, genotypes, priors, priorName); - } - - return GetGLsTest.getTests(GetGLsTest.class); - } +// +// @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") +// public void testBadGLs(GetGLsTest cfg) { +// testResultSimple(cfg); +// } @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs") public void testBiallelicGLs(GetGLsTest cfg) { @@ -185,11 +187,6 @@ public class AFCalcUnitTest extends BaseTest { testResultSimple(cfg); } - @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") - public void testBadGLs(GetGLsTest cfg) { - testResultSimple(cfg); - } - private static class NonInformativeData { final Genotype nonInformative; final List called; @@ -218,16 +215,14 @@ public class AFCalcUnitTest extends BaseTest { samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); final int nSamples = samples.size(); - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2, 2); final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors for ( AFCalc model : calcs ) { + if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc ) + continue; + final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat"); for ( int rotation = 0; rotation < nSamples; rotation++ ) { @@ -428,6 +423,94 @@ public class AFCalcUnitTest extends BaseTest { "Actual pNonRef not within tolerance " + tolerance + " of expected"); } + @DataProvider(name = "PNonRefBiallelicSystematic") + public Object[][] makePNonRefBiallelicSystematic() { + List tests = new ArrayList(); + + final List bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000); + final List> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true)); + + for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) { + + if ( false ) { // for testing only + tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))}); + } else { + if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK + + // test all combinations of PLs for 1 sample + for ( final List> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + + + final List> smallDiploidPLs = new LinkedList>(); + for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) { + for ( int i = 0; i < 2; i++ ) { + List pls = new ArrayList(Collections.nCopies(3, nonRefPL)); + pls.set(i, 0); + smallDiploidPLs.add(pls); + } + } + + for ( final List> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + final List> removeBadPLs(List> listOfPLs) { + List> clean = new LinkedList>(); + + for ( final List PLs : listOfPLs ) { + int x = PLs.get(0); + boolean bad = false; + for ( int pl1 : PLs ) + if ( pl1 > x ) + bad = true; + else + x = pl1; + if ( ! bad ) clean.add(PLs); + } + + return clean; + } + + private List toGenotypes(final List> PLsPerSample) { + final List nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + final List genotypes = new ArrayList(PLsPerSample.size()); + + for ( final List PLs : PLsPerSample ) { + final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3])); + final int min = MathUtils.arrayMin(pls); + for ( int i = 0; i < pls.length; i++ ) pls[i] -= min; + genotypes.add(makePL(nocall, pls)); + } + + return genotypes; + } + + @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic") + private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List genotypes) { + //logger.warn("Running " + modelType + " with " + genotypes); + final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human); + final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human); + + final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C)); + vcb.genotypes(genotypes); + + final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + + final double tolerance = 1e-3; + Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance, + "Actual pNonRef not within tolerance " + tolerance + " of expected"); + Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(), + "Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE())); + } + // -------------------------------------------------------------------------------- // // Test priors diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index dea38e46c..ac4634666 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -20,15 +21,22 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length]; - final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final Pair result = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final int lastK = result.getFirst(); + final int mleK = result.getSecond(); - final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)}; + final double log10LikelihoodAFGt0 = lastK == 0 ? MathUtils.LOG10_P_OF_ZERO : MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1, lastK+1); + final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], log10LikelihoodAFGt0}; final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; + final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); - final double pNonRef = lastK > 0 ? 0.0 : -1000.0; - final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef); + final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO; + final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); - return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele); + return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), + MathUtils.normalizeFromLog10(log10Likelihoods, true), + MathUtils.normalizeFromLog10(log10Priors, true), + log10pNonRefByAllele); } /** @@ -68,11 +76,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { } } - public int linearExact(final VariantContext vc, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyLikelihoods, - double[] log10AlleleFrequencyPosteriors) { - final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), false); + public Pair linearExact(final VariantContext vc, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyLikelihoods, + double[] log10AlleleFrequencyPosteriors) { + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -81,7 +89,7 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { double maxLog10L = Double.NEGATIVE_INFINITY; boolean done = false; - int lastK = -1; + int lastK = -1, mleK = -1; for (int k=0; k <= numChr && ! done; k++ ) { final double[] kMinus0 = logY.getkMinus0(); @@ -127,7 +135,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { // can we abort early? lastK = k; - maxLog10L = Math.max(maxLog10L, log10LofK); + if ( log10LofK > maxLog10L ) { + maxLog10L = log10LofK; + mleK = k; + } + if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) { //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); done = true; @@ -136,6 +148,6 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { logY.rotate(); } - return lastK; + return new Pair(lastK, mleK); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 301891a99..b82ec1d29 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -131,14 +131,14 @@ final class StateTracker { /** * @return the likelihoods summed across all AC values for AC > 0 */ - private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { + private double getLog10LikelihoodOfAFNotZero() { if ( log10LikelihoodsForAFGt0Sum == null ) { if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO; else log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); } - return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 0.0 : Double.POSITIVE_INFINITY); + return log10LikelihoodsForAFGt0Sum; } /** @@ -162,7 +162,7 @@ final class StateTracker { @Requires("allelesUsedInGenotyping != null") protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); - final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; + final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true); final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); @@ -210,7 +210,7 @@ final class StateTracker { * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state * @param alleleCountsForK the allele counts for this state */ - @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"}) + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) @Ensures("log10MLE == Math.max(log10LofK, log10MLE)") protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { addToLikelihoodsCache(log10LofK); @@ -227,7 +227,7 @@ final class StateTracker { * @param log10PofK the posterior of our current configuration state * @param alleleCountsForK the allele counts for this state */ - @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"}) + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) @Ensures("log10MAP == Math.max(log10PofK, log10MAP)") protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) { if ( log10PofK > log10MAP ) { @@ -236,7 +236,6 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LofK)"}) private void addToLikelihoodsCache(final double log10LofK) { // add to the cache log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK; @@ -250,7 +249,6 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; if ( log10LikelihoodOfAFzero > log10MLE ) { @@ -259,7 +257,7 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + @Requires({"MathUtils.goodLog10Probability(log10PosteriorOfAFzero)"}) protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { if ( log10PosteriorOfAFzero > log10MAP ) { log10MAP = log10PosteriorOfAFzero; From 6b6caf8e3a299553daa1dc5b64b33e0de14144a3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 14:42:00 -0400 Subject: [PATCH 15/34] Bugfix for indel DP calculations using reduced reads -- Adding tests for SNP and indel calling on reduced BAM --- ...delGenotypeLikelihoodsCalculationModel.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index e0ffb2ba6..1761f9cec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -231,7 +231,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood int count = 0; for (PileupElement p : pileup) { if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase())) - count++; + count += p.getRepresentativeCount(); } return count; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 72724e46a..3bb630d18 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -452,4 +452,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { Arrays.asList("bbf16e1873e525ee5975021cfb8988cf")); executeTest("test calling on a ReducedRead BAM", spec); } + + @Test + public void testReducedBamSNPs() { + testReducedCalling("SNP", ""); + } + + @Test + public void testReducedBamINDELs() { + testReducedCalling("INDEL", ""); + } + + + private void testReducedCalling(final String model, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1, + Arrays.asList(md5)); + executeTest("test calling on a ReducedRead BAM with " + model, spec); + } } From d21e42608ac43b58ade9984436d8c2b428eb6037 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 08:10:43 -0400 Subject: [PATCH 16/34] Updating integration tests for minor changes due to switching to EXACT_INDEPENDENT model by default --- ...GenotyperGeneralPloidyIntegrationTest.java | 6 +-- .../HaplotypeCallerIntegrationTest.java | 8 ++-- .../UnifiedGenotyperIntegrationTest.java | 44 +++++++++---------- .../SelectVariantsIntegrationTest.java | 4 +- .../NanoSchedulerIntegrationTest.java | 2 +- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 652489a71..b447a1113 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -60,12 +60,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","67dabdbf1e6ed8a83d2e85766558a20a"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","9ce24f4ff787aed9d3754519a60ef49f"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","d4bfae27f1b07923f381d708d8a34cf4"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","492c8ba9a80a902097ff15bbeb031592"); } @Test(enabled = true) @@ -80,7 +80,7 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","0a8c3b06243040b743dd90d497bb3f83"); } @Test(enabled = true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a441e6c77..870967f09 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,7 +21,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "75013fa6a884104f0b1797502b636698"); + HCTest(CEUTRIO_BAM, "", "ee866a8694a6f6c77242041275350ab9"); } @Test @@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "3cd3363976b1937d801f9f82996f4abe"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "53caa950535749f99d3c5b9bb61c7b60"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "6eb9c1026225b38ba7bd3c4c218f8269"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "b4ea70a446e4782bd3700ca14dd726ff"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -64,7 +64,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 3bb630d18..044d70fc2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("b3abf320f7d02d0e3b2883833419130e")); + Arrays.asList("847605f4efafef89529fe0e496315edd")); executeTest("test MultiSample Pilot1", spec); } @@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("57e409dbb12e0d85cd8af73db221b1fc")); + Arrays.asList("afb8768f31ab57eb43f75c1115eadc99")); executeTest("test SingleSample Pilot2", spec); } @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("48b4f4b05461be276bffc91350f08cbc")); + Arrays.asList("73c9b926c5e971a113de347a64fdcf20")); executeTest("test Multiple SNP alleles", spec); } @@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("04affcc9d720ee17bc221759707e0cd2")); + Arrays.asList("7f8d13690cb7d4173787afa00c496f12")); executeTest("test reverse trim", spec); } @@ -84,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("112e7bedfd284d4d9390aa006118c733")); + Arrays.asList("3c006b06b17bbe8e787d64eff6a63a19")); executeTest("test mismatched PLs", spec); } @@ -94,7 +94,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "367c0355b4e7b10c2988e5c41f44b3d2"; + private final static String COMPRESSED_OUTPUT_MD5 = "fd236bd635d514e4214d364f45ec4d10"; @Test public void testCompressedOutput() { @@ -115,7 +115,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "360d1274c1072a1ae9868e4e106c2650"; + String md5 = "d408b4661b820ed86272415b8ea08780"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -147,7 +147,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("6ae4a219c7b9c837fcbf12edeeac3c0c")); + Arrays.asList("839ecd30d354a36b5dfa2b5e99859765")); executeTest("test min_base_quality_score 26", spec); } @@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f9ea04d96eeef29e71d37e60518c2579"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "8800c58715c2bb434b69e1873cb77de6"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "67739a3ccf30975bcaef8a563e4b80cf"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "639df9f4c029792ac2e46069efc82b20"); } private void testOutputParameters(final String args, final String md5) { @@ -220,12 +220,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "f1c4c8e701b2334bf3c4f12fc395fec8" ); + testHeterozosity( 0.01, "986923de51c71635d47e3d06fe3794a1" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "7fbbf4a21d6bf0026bfdadbb3c086fbe" ); + testHeterozosity( 1.0 / 1850, "fb12b1553f813004a394a391a8540873" ); } private void testHeterozosity(final double arg, final String md5) { @@ -268,7 +268,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("950fb032cc9902ae48bd21f272d2fd52")); + Arrays.asList("98058fc913b61c22d44875da1f5ea89c")); executeTest(String.format("test calling with BAQ"), spec); } @@ -287,7 +287,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("b3df138254ed141b61a758df87757e0d")); + Arrays.asList("650c53774afacfc07a595675e8cdde17")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -302,7 +302,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("63fd9488daadd4baaef0a98f02916996")); + Arrays.asList("6a0c2a3a7bcc56ad01428c71408055aa")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -315,7 +315,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("52b5a432092995c92fe71e1942689ba8")); + Arrays.asList("5f2721c3323de5390d2d47446139f32b")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -343,13 +343,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("863ee56b3594f09795644127f2f9539f")); + Arrays.asList("a4761d7f25e7a62f34494801c98a0da7")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("503ca1b75cc7b2679eaa80f7b5e7ef1c")); + Arrays.asList("c526c234947482d1cd2ffc5102083a08")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -371,7 +371,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("945a2f994eaced8efdf8de24b58f2680")); + Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -449,18 +449,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("bbf16e1873e525ee5975021cfb8988cf")); + Arrays.asList("da9c05f87bd6415e97f90c49cf68ed19")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", ""); + testReducedCalling("SNP", "1d4a826b144723ff0766c36aa0239287"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", ""); + testReducedCalling("INDEL", "68ef51d5c98480e0c0192e0eecb95bca"); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 58d3677c7..f29ac8cad 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("549321a2543608f214ab4893ab478be6") + Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRegenotype--" + testFile, spec); @@ -216,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("549321a2543608f214ab4893ab478be6") + Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRemoveMLEAndRegenotype--" + testFile, spec); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index c1b28314c..37614f15f 100755 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "8cad82c3a5f5b932042933f136663c8a", nt, nct }); + tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct }); } return tests.toArray(new Object[][]{}); From 5296de8251ddb1cd9036bf9b7e57fc362a78f92b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 08:20:39 -0400 Subject: [PATCH 17/34] Fix UnifiedArgumentCollection constructor logic error -- The old way of overloading constructors and calling super didn't work (might have been a consequence of merge). This is the right way to do the copy constructor with the call to super() --- .../genotyper/UnifiedArgumentCollection.java | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index abf0b4420..e50f25fe8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,15 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; -<<<<<<< HEAD -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.pairhmm.PairHMM; -======= ->>>>>>> 19181ee... Moving pnrm to UnifiedArgumentCollection so it's available with the HaplotypeCaller import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) @@ -182,14 +177,30 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - public UnifiedArgumentCollection() { } - - public UnifiedArgumentCollection(final StandardCallerArgumentCollection SCAC) { - super(SCAC); + /** + * Create a new UAC with defaults for all UAC arguments + */ + public UnifiedArgumentCollection() { + super(); } - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + /** + * Create a new UAC based on the information only our in super-class scac and defaults for all UAC arguments + * @param scac + */ + public UnifiedArgumentCollection(final StandardCallerArgumentCollection scac) { + super(scac); + } + + /** + * Create a new UAC with all parameters having the values in uac + * + * @param uac + */ public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + super(uac); + this.GLmodel = uac.GLmodel; this.AFmodel = uac.AFmodel; this.PCR_error = uac.PCR_error; @@ -215,6 +226,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; this.pairHMM = uac.pairHMM; + // todo- arguments to remove this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } From eb6c9a1a79b8979353d85723d26138be2d52ff6a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 09:57:31 -0400 Subject: [PATCH 18/34] Disable EfficiencyMonitoringThreadFactoryUnitTest -- This is no longer a core GATK activity, and the tests need to run for so long (2 min each) that it's just too painful to run them. Should be re-eabled if we come to care about this capability again, or if we can run these tests all in parallel in the future. --- .../threading/EfficiencyMonitoringThreadFactoryUnitTest.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 7381bebc4..c072c808d 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -130,7 +130,10 @@ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { return StateTest.getTests(StateTest.class); } - @Test(enabled = true, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) + // NOTE this test takes an unreasonably long time to run, and so it's been disabled as these monitoring threads + // aren't a core GATK feature any longer. Should be reabled if we come to care about this capability again + // in the future, or we can run these in parallel + @Test(enabled = false, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) public void testStateTest(final StateTest test) throws InterruptedException { // allows us to test blocking final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates()); From 9f2851d769e3f14b19f4e92c07c25d4b4689e250 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 20:23:11 -0400 Subject: [PATCH 19/34] Updating UnifiedGenotyperGeneralPloidyIntegrationTest following rebasing -- Created a JIRA ticket https://jira.broadinstitute.org/browse/GSA-623 for Guillermo to look at the differences as the multi-allelic nature of many sites seems to change with the new more protected infrastructure. This may be due to implementation issues in the pooled caller, problems with my interface, or could be a genuine improvement. --- .../UnifiedGenotyperGeneralPloidyIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index b447a1113..ae3befe66 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -70,12 +70,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","06a512271631c5b511314a2618de82d7"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","848e1092b5cd57b0da5f1187e67134e7"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","36a383adfdbf1f59656138b538a9920d"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","51a7b51d82a341adec0e6510f5dfadd8"); } @Test(enabled = true) From ccae6a5b9264350050695de05dad098884ea99f9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 22 Oct 2012 11:48:34 -0400 Subject: [PATCH 20/34] Fixed the RR bug I (knowingly) introduced last week: turns out we can't trust a context size's worth of data from the previous marking. I think Mauricio warned me about this but I forgot. --- .../gatk/walkers/compression/reducereads/SlidingWindow.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 63524ae82..32abe8ef6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -276,7 +276,10 @@ public class SlidingWindow { final int windowHeaderStartLocation = getStartLocation(windowHeader); final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1; - final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion); + + // copy over as many bits as we can from the previous calculation. Note that we can't trust the + // last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there. + final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1; final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize); // update the iterator to the correct position From 97dc3664c9ca9bd42d98e1a236219bd4e849fa26 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Mon, 22 Oct 2012 12:04:23 -0400 Subject: [PATCH 21/34] Fixed yet another NPE related to the ArgumentTypeDescriptor vs. ArgumentMatchValue. Added integration test based on GSA-621. --- .../gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java | 2 +- .../genotyper/UnifiedGenotyperIntegrationTest.java | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java index 43350ccc1..f521c959d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java @@ -151,7 +151,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { ? new VariantContextWriterStub(engine, writerFile, argumentSources) : new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); - stub.setCompressed(isCompressed(writerFileName.asString())); + stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString())); stub.setDoNotWriteGenotypes(argumentIsPresent(createSitesOnlyArgumentDefinition(),matches)); stub.setSkipWritingCommandLineHeader(argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches)); stub.setForceBCF(argumentIsPresent(createBCFArgumentDefinition(),matches)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 044d70fc2..3a2d6f697 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -7,6 +7,7 @@ import org.testng.annotations.Test; import java.io.File; import java.util.Arrays; +import java.util.Collections; import java.util.List; // ********************************************************************************** // @@ -18,6 +19,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; // -------------------------------------------------------------------------------------------------------------- // @@ -175,6 +177,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test using comp track", spec); } + @Test + public void testNoCmdLineHeaderStdout() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0, + Collections.emptyList()); + executeTest("testNoCmdLineHeaderStdout", spec); + } + @Test public void testOutputParameterSitesOnly() { testOutputParameters("-sites_only", "97ba874eafc9884a4de027a84c036311"); From 90f59803fd8c02bf7c2c38f3e909b72ab2c35cf8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 22 Oct 2012 09:58:43 -0400 Subject: [PATCH 22/34] MaxAltAlleles now defaults to 6, no more MaxAltAllelesForIndels -- Updated StandardCallerArgumentCollection to remove MaxAltAllelesForIndels. Previous argument is deprecated with meaningful doc message for people to use maxAltAlleles -- All constructores, factory methods, and test builders and their users updated to provide just a single argument -- Updating MD5s for integration tests that change due to genotyping more alleles -- Adding more alleles to genotyping results in slight changes in the QUAL value for multi-allelic loci where one or more alleles aren't polymorphic. That's simply due to the way that alternative hypotheses contribute as reference evidence against each true allele. The effect can be large (new qual = old qual / 2 in one case here). -- If we want more precision in our estimates we could decide (Eric, should we discuss?) to actually separately do a discovery phase in the genotyping, eliminate all variants not considered polymorphic, and then do a final round of calling to get the exact QUAL value for only those that are segregating. This would have the value of having the QUAL stay constant as more alleles are genotyped, at the cost of some code complexity increase and runtime. Might be worth it through --- .../genotyper/afcalc/AFCalcTestBuilder.java | 2 +- .../afcalc/GeneralPloidyExactAFCalc.java | 14 ++++----- .../genotyper/afcalc/AFCalcUnitTest.java | 4 +-- ...neralPloidyAFCalculationModelUnitTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 2 +- .../StandardCallerArgumentCollection.java | 13 ++++++--- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 10 ++----- .../genotyper/afcalc/AFCalcFactory.java | 29 ++++++++----------- .../genotyper/afcalc/DiploidExactAFCalc.java | 14 ++++----- .../walkers/genotyper/afcalc/ExactAFCalc.java | 4 +-- .../IndependentAllelesDiploidExactAFCalc.java | 17 ++++++----- .../afcalc/OriginalDiploidExactAFCalc.java | 4 +-- .../afcalc/ReferenceDiploidExactAFCalc.java | 4 +-- .../GLBasedSampleSelector.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 4 +-- 15 files changed, 59 insertions(+), 66 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index cfb67164d..6f3740ab3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -54,7 +54,7 @@ public class AFCalcTestBuilder { } public AFCalc makeModel() { - return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), getNumAltAlleles(), 2); + return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), 2); } public double[] makePriors() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 2b247430c..b248c8759 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -40,22 +40,20 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static boolean VERBOSE = false; - protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); this.ploidy = ploidy; } @Override protected VariantContext reduceScope(VariantContext vc) { - final int maxAltAlleles = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype; - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > maxAltAlleles) { - logger.warn("this tool is currently set to genotype at most " + maxAltAlleles + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - final List alleles = new ArrayList(maxAltAlleles + 1); + final List alleles = new ArrayList(getMaxAltAlleles() + 1); alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, maxAltAlleles, ploidy)); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy)); VariantContextBuilder builder = new VariantContextBuilder(vc); builder.alleles(alleles); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index ef4318a40..2d346e548 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -125,7 +125,7 @@ public class AFCalcUnitTest extends BaseTest { final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { - List calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2); final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors @@ -215,7 +215,7 @@ public class AFCalcUnitTest extends BaseTest { samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); final int nSamples = samples.size(); - List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2); final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java index 1b3a4c0c0..3df2f7883 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java @@ -140,7 +140,7 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, 1 + cfg.numAltAlleles, cfg.ploidy); + final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, cfg.ploidy); calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 870967f09..1d4fad19e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "53caa950535749f99d3c5b9bb61c7b60"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "53caa950535749f99d3c5b9bb61c7b60"); } private void HCTestComplexVariants(String bam, String args, String md5) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 84dfa694b..aa3a1e6ac 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -55,20 +55,25 @@ public class StandardCallerArgumentCollection { * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend * that you not play around with this parameter. + * + * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6. */ @Advanced @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) - public int MAX_ALTERNATE_ALLELES = 3; + public int MAX_ALTERNATE_ALLELES = 6; /** * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend * that you not play around with this parameter. + * + * This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on */ - @Advanced - @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false) - public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2; + @Deprecated + @Hidden + @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on, which will apply to any variant, regardless of type", required = false) + public int MAX_ALTERNATE_ALLELES_FOR_INDELS = -1; /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index e3abdeb24..f783267bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -45,7 +45,6 @@ public abstract class AFCalc implements Cloneable { protected final int nSamples; protected final int maxAlternateAllelesToGenotype; - protected final int maxAlternateAllelesForIndels; protected Logger logger = defaultLogger; @@ -60,19 +59,16 @@ public abstract class AFCalc implements Cloneable { * * @param nSamples number of samples, must be > 0 * @param maxAltAlleles maxAltAlleles for SNPs - * @param maxAltAllelesForIndels for indels * @param ploidy the ploidy, must be > 0 */ - protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + protected AFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); - if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels); if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be > 0 but got " + ploidy); this.nSamples = nSamples; this.maxAlternateAllelesToGenotype = maxAltAlleles; - this.maxAlternateAllelesForIndels = maxAltAllelesForIndels; - this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); + this.stateTracker = new StateTracker(maxAltAlleles); } /** @@ -191,7 +187,7 @@ public abstract class AFCalc implements Cloneable { // --------------------------------------------------------------------------- public int getMaxAltAlleles() { - return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); + return maxAlternateAllelesToGenotype; } protected StateTracker getStateTracker() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index 80de555ca..efb16101e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -91,7 +91,7 @@ public class AFCalcFactory { public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger) { - final int maxAltAlleles = Math.max(UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS); + final int maxAltAlleles = UAC.MAX_ALTERNATE_ALLELES; if ( ! UAC.AFmodel.usableForParams(UAC.samplePloidy, maxAltAlleles) ) { logger.info("Requested ploidy " + UAC.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option"); final List supportingCalculations = new LinkedList(); @@ -109,7 +109,7 @@ public class AFCalcFactory { logger.info("Selecting model " + UAC.AFmodel); } - final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.samplePloidy); + final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.samplePloidy); if ( logger != null ) calc.setLogger(logger); if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog); @@ -126,7 +126,7 @@ public class AFCalcFactory { * @return an initialized AFCalc */ public static AFCalc createAFCalc(final int nSamples) { - return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2, 2); + return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2); } /** @@ -139,7 +139,7 @@ public class AFCalcFactory { * @return an initialized AFCalc */ public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles) { - return createAFCalc(calc, nSamples, maxAltAlleles, maxAltAlleles, 2); + return createAFCalc(calc, nSamples, maxAltAlleles, 2); } /** @@ -147,14 +147,12 @@ public class AFCalcFactory { * * @param nSamples the number of samples we'll be using * @param maxAltAlleles the max. alt alleles to consider for SNPs - * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs * @param ploidy the sample ploidy. Must be consistent with the calc * * @return an initialized AFCalc */ - public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { - final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels); - return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAlt), nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAltAlleles), nSamples, maxAltAlleles, ploidy); } /** @@ -181,20 +179,17 @@ public class AFCalcFactory { * @param calc the calculation to use * @param nSamples the number of samples we'll be using * @param maxAltAlleles the max. alt alleles to consider for SNPs - * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs * @param ploidy the sample ploidy. Must be consistent with the calc * * @return an initialized AFCalc */ - public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int ploidy) { if ( calc == null ) throw new IllegalArgumentException("Calculation cannot be null"); if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); - if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels); if ( ploidy < 1 ) throw new IllegalArgumentException("sample ploidy must be greater than zero " + ploidy); - final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels); - if ( ! calc.usableForParams(ploidy, maxAlt) ) + if ( ! calc.usableForParams(ploidy, maxAltAlleles) ) throw new IllegalArgumentException("AFCalc " + calc + " does not support requested ploidy " + ploidy); final Class afClass = getClassByName(calc.className); @@ -202,19 +197,19 @@ public class AFCalcFactory { throw new IllegalArgumentException("Unexpected AFCalc " + calc); try { - Object args[] = new Object[]{nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy}; - Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class, int.class); + Object args[] = new Object[]{nSamples, maxAltAlleles, ploidy}; + Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class); return (AFCalc)c.newInstance(args); } catch (Exception e) { throw new ReviewedStingException("Could not instantiate AFCalc " + calc, e); } } - protected static List createAFCalcs(final List calcs, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + protected static List createAFCalcs(final List calcs, final int nSamples, final int maxAltAlleles, final int ploidy) { final List AFCalcs = new LinkedList(); for ( final Calculation calc : calcs ) - AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy)); + AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, ploidy)); return AFCalcs; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 6b345dcf5..4895c84d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -31,8 +31,8 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; public abstract class DiploidExactAFCalc extends ExactAFCalc { - public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); } @@ -75,16 +75,14 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { @Override protected VariantContext reduceScope(final VariantContext vc) { - final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype; - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { - logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + if ( vc.getAlternateAlleles().size() > getMaxAltAlleles() ) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); VariantContextBuilder builder = new VariantContextBuilder(vc); - List alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); + List alleles = new ArrayList(getMaxAltAlleles() + 1); alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles())); builder.alleles(alleles); builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); return builder.make(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java index df0793352..ab230d398 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java @@ -39,8 +39,8 @@ import java.util.ArrayList; abstract class ExactAFCalc extends AFCalc { protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first - protected ExactAFCalc(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index 804b560b4..857b7e59e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -89,7 +89,7 @@ import java.util.*; /** * The min. confidence of an allele to be included in the joint posterior. */ - private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-20); + private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10); private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); @@ -111,9 +111,9 @@ import java.util.*; */ final AFCalc biAlleleExactModel; - protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); - biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy); + protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy); } /** @@ -336,12 +336,13 @@ import java.util.*; // MLE of altI allele is simply the MLE of this allele in altAlleles alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); - log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); - log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); - // the AF > 0 case requires us to store the normalized likelihood for later summation - if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) + if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); + log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); + } + log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index ac4634666..fc26111e0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -13,8 +13,8 @@ import java.util.Map; * Original bi-allelic ~O(N) implementation. Kept here for posterity and reference */ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { - protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); } @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java index b4e7b2ab1..97e5fed3b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java @@ -1,7 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc { - protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index 48a2d2700..d71d0c9c8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -55,7 +55,7 @@ public class GLBasedSampleSelector extends SampleSelector { // do we want to apply a prior? maybe user-spec? if ( flatPriors == null ) { flatPriors = new double[1+2*samples.size()]; - AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 4, 2); + AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 2); } final AFCalcResult result = AFCalculator.getLog10PNonRef(subContext, flatPriors); // do we want to let this qual go up or down? diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 3a2d6f697..a1701e3e5 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("73c9b926c5e971a113de347a64fdcf20")); + Arrays.asList("543f68e42034bf44cfb24da8c9204320")); executeTest("test Multiple SNP alleles", spec); } @@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("7f8d13690cb7d4173787afa00c496f12")); + Arrays.asList("5ce03dd9ca2d9324c1d4a9d64389beb5")); executeTest("test reverse trim", spec); } From 008df545754ce9dee93a0f0de29056385bea73b2 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 22 Oct 2012 14:21:52 -0400 Subject: [PATCH 23/34] Bug fix in GATKSAMRecord.getSoftEnd() for reads that are entirely clipped. --- .../sting/utils/sam/GATKSAMRecord.java | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 1feb76517..9fdb48b34 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -60,8 +60,9 @@ public class GATKSAMRecord extends BAMRecord { private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; private byte[] reducedReadCounts = null; - private int softStart = -1; - private int softEnd = -1; + private final static int UNINITIALIZED = -1; + private int softStart = UNINITIALIZED; + private int softEnd = UNINITIALIZED; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; @@ -386,7 +387,7 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped start of the read taking soft clips (but not hard clips) into account */ public int getSoftStart() { - if (softStart < 0) { + if ( softStart == UNINITIALIZED ) { softStart = getAlignmentStart(); for (final CigarElement cig : getCigar().getCigarElements()) { final CigarOperator op = cig.getOperator(); @@ -408,17 +409,23 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ public int getSoftEnd() { - if ( softEnd < 0 ) { + if ( softEnd == UNINITIALIZED ) { + boolean foundAlignedBase = false; softEnd = getAlignmentEnd(); final List cigs = getCigar().getCigarElements(); - for (int i=cigs.size() - 1; i>=0; --i) { + for (int i = cigs.size() - 1; i >= 0; --i) { final CigarElement cig = cigs.get(i); final CigarOperator op = cig.getOperator(); - if (op == CigarOperator.SOFT_CLIP) + if (op == CigarOperator.SOFT_CLIP) // assumes the soft clip that we found is at the end of the aligned read softEnd += cig.getLength(); - else if (op != CigarOperator.HARD_CLIP) + else if (op != CigarOperator.HARD_CLIP) { + foundAlignedBase = true; break; + } + } + if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end + softEnd = getAlignmentEnd(); } } From fd59e7d5f65784182918a9f86f888cb3515bf1af Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Mon, 22 Oct 2012 16:27:31 -0400 Subject: [PATCH 24/34] Better error message when generic types are erased from scala collections. --- .../broadinstitute/sting/utils/exceptions/UserException.java | 3 +++ .../sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala | 3 +++ 2 files changed, 6 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 6b97f8f9f..c1f408bc7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -352,6 +352,9 @@ public class UserException extends ReviewedStingException { } public static class CannotExecuteQScript extends UserException { + public CannotExecuteQScript(String message) { + super(String.format("Unable to execute QScript: " + message)); + } public CannotExecuteQScript(String message, Exception e) { super(String.format("Unable to execute QScript: " + message), e); } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala index 0d8edc25d..54e89ec58 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala @@ -28,6 +28,7 @@ import collection.JavaConversions._ import org.broadinstitute.sting.queue.QException import java.lang.Class import org.broadinstitute.sting.commandline.{ArgumentMatches, ArgumentSource, ArgumentTypeDescriptor, ParsingEngine} +import org.broadinstitute.sting.utils.exceptions.UserException import java.lang.reflect.Type /** @@ -75,6 +76,8 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { def parse(parsingEngine: ParsingEngine, source: ArgumentSource, classType: Class[_], argumentMatches: ArgumentMatches) = { val componentType = ReflectionUtils.getCollectionType(source.field) + if (componentType == classOf[java.lang.Object]) + throw new UserException.CannotExecuteQScript("Please also include a @ClassType(classOf[]) annotation on field: " + source.field + ". Example: @ClassType(classOf[Double]). The scala generic type for the field was subjected to java/scala type erasure and is not available via reflection.") val componentArgumentParser = parsingEngine.selectBestTypeDescriptor(componentType) if (classOf[Seq[_]].isAssignableFrom(classType)) { From 15b28e61cd6f73c9ddbab1e06e8b0cb08e62572e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 22 Oct 2012 16:54:38 -0400 Subject: [PATCH 25/34] Retiring TraverseReads and TraverseLoci after testing confirms nano scheduler version in single threaded version is fine -- There's been no report of problems with the nano scheduled version of TraverseLoci and TraverseReads, so I'm removing the old versions since they are no longer needed -- Removing unnecessary intermediate base classes -- GSA-515 / Nanoscheduler GSA-549 / https://jira.broadinstitute.org/browse/GSA-549 --- .../sting/gatk/executive/MicroScheduler.java | 12 +- .../gatk/traversals/TraverseLociBase.java | 103 ---------------- .../gatk/traversals/TraverseLociLinear.java | 47 -------- .../gatk/traversals/TraverseLociNano.java | 85 +++++++++++++- .../gatk/traversals/TraverseReadPairs.java | 2 +- .../sting/gatk/traversals/TraverseReads.java | 111 ------------------ .../traversals/TraverseReadsUnitTest.java | 11 +- 7 files changed, 89 insertions(+), 282 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 223e11680..df4ed9ef8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -74,8 +74,6 @@ import java.util.*; * */ public abstract class MicroScheduler implements MicroSchedulerMBean { - // TODO -- remove me and retire non nano scheduled versions of traversals - private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true; protected static final Logger logger = Logger.getLogger(MicroScheduler.class); /** @@ -238,15 +236,9 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { @Ensures("result != null") private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { if (walker instanceof ReadWalker) { - if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) - return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); - else - return new TraverseReads(); + return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); } else if (walker instanceof LocusWalker) { - if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) - return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); - else - return new TraverseLociLinear(); + return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); } else if (walker instanceof DuplicateWalker) { return new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java deleted file mode 100755 index 30e78ef5c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java +++ /dev/null @@ -1,103 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. - */ -public abstract class TraverseLociBase extends TraversalEngine,LocusShardDataProvider> { - /** - * our log, which we want to capture anything from this class - */ - protected static final Logger logger = Logger.getLogger(TraversalEngine.class); - - @Override - public final String getTraversalUnits() { - return "sites"; - } - - protected static class TraverseResults { - final int numIterations; - final T reduceResult; - - public TraverseResults(int numIterations, T reduceResult) { - this.numIterations = numIterations; - this.reduceResult = reduceResult; - } - } - - protected abstract TraverseResults traverse( final LocusWalker walker, - final LocusView locusView, - final LocusReferenceView referenceView, - final ReferenceOrderedView referenceOrderedDataView, - final T sum); - - @Override - public T traverse( LocusWalker walker, - LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = getLocusView( walker, dataProvider ); - - if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView referenceOrderedDataView = null; - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); - sum = result.reduceResult; - dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); - updateCumulativeMetrics(dataProvider.getShard()); - } - - // We have a final map call to execute here to clean up the skipped based from the - // last position in the ROD to that in the interval - if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { - // only do this if the walker isn't done! - final RodLocusView rodLocusView = (RodLocusView)locusView; - final long nSkipped = rodLocusView.getLastSkippedBases(); - if ( nSkipped > 0 ) { - final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - final M x = walker.map(null, null, ac); - sum = walker.reduce(x, sum); - } - } - - return sum; - } - - /** - * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype - * that comes along. - * @param walker walker to interrogate. - * @param dataProvider Data which which to drive the locus view. - * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. - */ - private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java deleted file mode 100755 index 22381092f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java +++ /dev/null @@ -1,47 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.LocusView; -import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.GenomeLoc; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. - */ -public class TraverseLociLinear extends TraverseLociBase { - - @Override - protected TraverseResults traverse(LocusWalker walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) { - // We keep processing while the next reference location is within the interval - boolean done = false; - int numIterations = 0; - - while( locusView.hasNext() && ! done ) { - numIterations++; - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - final boolean keepMeP = walker.filter(tracker, refContext, locus); - if (keepMeP) { - final M x = walker.map(tracker, refContext, locus); - sum = walker.reduce(x, sum); - done = walker.isDone(); - } - - printProgress(locus.getLocation()); - } - - return new TraverseResults(numIterations, sum); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java index 469625c30..84715e5fd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -1,24 +1,26 @@ package org.broadinstitute.sting.gatk.traversals; +import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.LocusView; -import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction; import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import java.util.Iterator; /** * A simple solution to iterating over all reference positions over a series of genomic locations. */ -public class TraverseLociNano extends TraverseLociBase { +public class TraverseLociNano extends TraversalEngine,LocusShardDataProvider> { /** our log, which we want to capture anything from this class */ private static final boolean DEBUG = false; @@ -30,6 +32,81 @@ public class TraverseLociNano extends TraverseLociBase { } @Override + public final String getTraversalUnits() { + return "sites"; + } + + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + @Override + public T traverse( LocusWalker walker, + LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = getLocusView( walker, dataProvider ); + + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + updateCumulativeMetrics(dataProvider.getShard()); + } + + // We have a final map call to execute here to clean up the skipped based from the + // last position in the ROD to that in the interval + if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { + // only do this if the walker isn't done! + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); + if ( nSkipped > 0 ) { + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); + sum = walker.reduce(x, sum); + } + } + + return sum; + } + + /** + * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' + * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * that comes along. + * @param walker walker to interrogate. + * @param dataProvider Data which which to drive the locus view. + * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. + */ + private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + if( dataSource == DataSource.READS ) + return new CoveredLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) + return new AllLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) + return new RodLocusView(dataProvider); + else + throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); + } + protected TraverseResults traverse(final LocusWalker walker, final LocusView locusView, final LocusReferenceView referenceView, diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java index aef3cf7d0..8273e1328 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java @@ -42,7 +42,7 @@ public class TraverseReadPairs extends TraversalEngine walker, ReadShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider)); + logger.debug(String.format("TraverseReadsPairs.traverse Covered dataset is %s", dataProvider)); if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java deleted file mode 100755 index d41d17bde..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ReadView; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * @author aaron - * @version 1.0 - * @date Apr 24, 2009 - *

- * Class TraverseReads - *

- * This class handles traversing by reads in the new shardable style - */ -public class TraverseReads extends TraversalEngine,ReadShardDataProvider> { - /** our log, which we want to capture anything from this class */ - protected static final Logger logger = Logger.getLogger(TraverseReads.class); - - @Override - public String getTraversalUnits() { - return "reads"; - } - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to traverse with - * @param dataProvider the provider of the reads data - * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function - * @return the reduce variable of the read walker - */ - public T traverse(ReadWalker walker, - ReadShardDataProvider dataProvider, - T sum) { - - logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider)); - - if( !dataProvider.hasReads() ) - throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - - // get the reference ordered data - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - - boolean done = walker.isDone(); - // while we still have more reads - for (final SAMRecord read : reads) { - if ( done ) break; - - // ReferenceContext -- the reference bases covered by the read - final ReferenceContext refContext = ! read.getReadUnmappedFlag() && dataProvider.hasReference() - ? reference.getReferenceContext(read) - : null; - - // update the number of reads we've seen - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; - - final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); - if (keepMeP) { - M x = walker.map(refContext, (GATKSAMRecord) read, tracker); // the tracker can be null - sum = walker.reduce(x, sum); - } - - final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); - - updateCumulativeMetrics(dataProvider.getShard()); - printProgress(locus); - - done = walker.isDone(); - } - return sum; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 46f77c283..bf1fc9e65 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -6,13 +6,12 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.qc.CountReads; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -62,9 +61,9 @@ public class TraverseReadsUnitTest extends BaseTest { private SAMReaderID bam = new SAMReaderID(new File(validationDataLocation + "index_test.bam"),new Tags()); // TCGA-06-0188.aligned.duplicates_marked.bam"); private File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); private List bamList; - private Walker countReadWalker; + private ReadWalker countReadWalker; private File output; - private TraverseReads traversalEngine = null; + private TraverseReadsNano traversalEngine = null; private IndexedFastaSequenceFile ref = null; private GenomeLocParser genomeLocParser = null; @@ -107,7 +106,7 @@ public class TraverseReadsUnitTest extends BaseTest { bamList.add(bam); countReadWalker = new CountReads(); - traversalEngine = new TraverseReads(); + traversalEngine = new TraverseReadsNano(1); traversalEngine.initialize(engine); } @@ -125,7 +124,7 @@ public class TraverseReadsUnitTest extends BaseTest { fail("Shard == null"); } - ShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null); + ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null); accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator); dataProvider.close(); } From f838815343d971d3a9ff2eda0df931aa69654c52 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 23 Oct 2012 06:47:53 -0400 Subject: [PATCH 26/34] Updating MD5s for confidence ref site estimation in IndependentAllelesDiploidExactAFCalc -- Included logic to only add priors for alleles with sufficient evidence to be called polymorphic. If no alleles are poly make sure to add priors of first allele --- .../afcalc/IndependentAllelesDiploidExactAFCalc.java | 8 ++++++++ .../genotyper/UnifiedGenotyperIntegrationTest.java | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index 857b7e59e..d0b801a20 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -329,6 +329,7 @@ import java.util.*; double log10PosteriorOfACEq0Sum = 0.0; double log10PosteriorOfACGt0Sum = 0.0; + boolean anyPoly = false; for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); final int altI = vc.getAlleles().indexOf(altAllele) - 1; @@ -338,6 +339,7 @@ import java.util.*; // the AF > 0 case requires us to store the normalized likelihood for later summation if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { + anyPoly = true; log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); @@ -352,6 +354,12 @@ import java.util.*; nEvaluations += sortedResultWithThetaNPriors.nEvaluations; } + // If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation + if ( ! anyPoly ) { + log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0(); + } + // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a1701e3e5..dee54b9f8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -192,12 +192,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "8800c58715c2bb434b69e1873cb77de6"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f9ea04d96eeef29e71d37e60518c2579"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "639df9f4c029792ac2e46069efc82b20"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "41c046d38ea328421df924e37e017645"); } private void testOutputParameters(final String args, final String md5) { From 6e421a72d642b1e2477af3528a2248859332cd0f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 23 Oct 2012 20:10:39 -0400 Subject: [PATCH 27/34] Add more exhaustive unit tests for input errors to NanoScheduler -- Resolves issue GSA-515 / Nanoscheduler GSA-605 / Seems that -nct may deadlock as not reproducible -- It seems that it's not an input error problem (or at least cannot be provoked with unit tests) -- I'll keep an eye on this later --- .../nanoScheduler/NanoSchedulerUnitTest.java | 100 +++++++++++++----- 1 file changed, 73 insertions(+), 27 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 2ba2da734..af2e18ad9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -23,6 +23,7 @@ import java.util.List; * To change this template use File | Settings | File Templates. */ public class NanoSchedulerUnitTest extends BaseTest { + private final static boolean DEBUG = false; private final static boolean debug = false; public static final int NANO_SCHEDULE_MAX_RUNTIME = 30000; @@ -30,19 +31,22 @@ public class NanoSchedulerUnitTest extends BaseTest { @Override public Integer apply(Integer input) { return input * 2; } } + private static void maybeDelayMe(final int input) { + try { + if ( input % 7 == 0 ) { + final int milliToSleep = (input % 10); + //System.out.printf("Sleeping %d millseconds%n", milliToSleep); + Thread.sleep(milliToSleep); + } + } catch ( InterruptedException ex ) { + throw new RuntimeException(ex); + } + } + private static class Map2xWithDelays extends Map2x { @Override public Integer apply(Integer input) { - try { - if ( input % 7 == 0 ) { - final int milliToSleep = (input % 10); - //System.out.printf("Sleeping %d millseconds%n", milliToSleep); - Thread.sleep(milliToSleep); - } - - return input * 2; - } catch ( InterruptedException ex ) { - throw new RuntimeException(ex); - } + maybeDelayMe(input); + return input * 2; } } @@ -117,10 +121,12 @@ public class NanoSchedulerUnitTest extends BaseTest { } static NanoSchedulerBasicTest exampleTest = null; + static NanoSchedulerBasicTest exampleTestWithDelays = null; @BeforeSuite public void setUp() throws Exception { exampleTest = new NanoSchedulerBasicTest(10, 2, 1, 10, false); + exampleTestWithDelays = new NanoSchedulerBasicTest(10, 2, 1, 10, true); } @DataProvider(name = "NanoSchedulerBasicTest") @@ -151,14 +157,14 @@ public class NanoSchedulerUnitTest extends BaseTest { return NanoSchedulerBasicTest.getTests(NanoSchedulerBasicTest.class); } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testSingleThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { logger.warn("Running " + test); if ( test.nThreads == 1 ) testNanoScheduler(test); } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME, dependsOnMethods = "testSingleThreadedNanoScheduler") + @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME, dependsOnMethods = "testSingleThreadedNanoScheduler") public void testMultiThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { logger.warn("Running " + test); if ( test.nThreads >= 1 ) @@ -195,7 +201,7 @@ public class NanoSchedulerUnitTest extends BaseTest { } } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { if ( test.bufferSize > 1) { logger.warn("Running " + test); @@ -213,7 +219,7 @@ public class NanoSchedulerUnitTest extends BaseTest { } } - @Test(timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! DEBUG, timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdown() throws InterruptedException { final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); @@ -221,38 +227,78 @@ public class NanoSchedulerUnitTest extends BaseTest { Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); } - @Test(expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! DEBUG, expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdownExecuteFailure() throws InterruptedException { final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); nanoScheduler.shutdown(); nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); } - @Test(expectedExceptions = NullPointerException.class, timeOut = 10000, invocationCount = 50) + @DataProvider(name = "NanoSchedulerInputExceptionTest") + public Object[][] createNanoSchedulerInputExceptionTest() { + List tests = new ArrayList(); + + + for ( final int bufSize : Arrays.asList(100) ) { + for ( final int nThreads : Arrays.asList(8) ) { + for ( final boolean addDelays : Arrays.asList(true, false) ) { + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(bufSize, nThreads, 1, 1000000, false); + final int maxN = addDelays ? 10000 : 100000; + for ( int nElementsBeforeError = 0; nElementsBeforeError < maxN; nElementsBeforeError += Math.max(nElementsBeforeError / 10, 1) ) { + tests.add(new Object[]{nElementsBeforeError, test, addDelays}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, expectedExceptions = NullPointerException.class, timeOut = 10000) public void testInputErrorIsThrown_NPE() throws InterruptedException { - executeTestErrorThrowingInput(new NullPointerException()); + executeTestErrorThrowingInput(10, new NullPointerException(), exampleTest, false); } - @Test(expectedExceptions = ReviewedStingException.class, timeOut = 10000, invocationCount = 50) + @Test(enabled = true, expectedExceptions = ReviewedStingException.class, timeOut = 10000) public void testInputErrorIsThrown_RSE() throws InterruptedException { - executeTestErrorThrowingInput(new ReviewedStingException("test")); + executeTestErrorThrowingInput(10, new ReviewedStingException("test"), exampleTest, false); } - private void executeTestErrorThrowingInput(final RuntimeException ex) { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); - nanoScheduler.execute(new ErrorThrowingIterator(ex), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); + @Test(enabled = true, expectedExceptions = NullPointerException.class, dataProvider = "NanoSchedulerInputExceptionTest", timeOut = 10000, invocationCount = 1) + public void testInputErrorDoesntDeadlock(final int nElementsBeforeError, final NanoSchedulerBasicTest test, final boolean addDelays ) throws InterruptedException { + executeTestErrorThrowingInput(nElementsBeforeError, new NullPointerException(), test, addDelays); + } + + private void executeTestErrorThrowingInput(final int nElementsBeforeError, final RuntimeException ex, final NanoSchedulerBasicTest test, final boolean addDelays) { + logger.warn("executeTestErrorThrowingInput " + nElementsBeforeError + " ex=" + ex + " test=" + test + " addInputDelays=" + addDelays); + final NanoScheduler nanoScheduler = test.makeScheduler(); + nanoScheduler.execute(new ErrorThrowingIterator(nElementsBeforeError, ex, addDelays), test.makeMap(), test.initReduce(), test.makeReduce()); } private static class ErrorThrowingIterator implements Iterator { + final int nElementsBeforeError; + final boolean addDelays; + int i = 0; final RuntimeException ex; - private ErrorThrowingIterator(RuntimeException ex) { + private ErrorThrowingIterator(final int nElementsBeforeError, RuntimeException ex, boolean addDelays) { + this.nElementsBeforeError = nElementsBeforeError; this.ex = ex; + this.addDelays = addDelays; } - @Override public boolean hasNext() { throw ex; } - @Override public Integer next() { throw ex; } - @Override public void remove() { throw ex; } + @Override public boolean hasNext() { return true; } + @Override public Integer next() { + if ( i++ > nElementsBeforeError ) { + throw ex; + } else if ( addDelays ) { + maybeDelayMe(i); + return i; + } else { + return i; + } + } + @Override public void remove() { throw new UnsupportedOperationException("x"); } } public static void main(String [ ] args) { From 094db7bf2468352811764d4e98f38d637516a577 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 24 Oct 2012 14:07:36 -0400 Subject: [PATCH 28/34] We now require at least 10 samples to merge variants into complex events in the HC. Added a new population based bam for the complex event integration test. --- .../sting/gatk/walkers/haplotypecaller/GenotypingEngine.java | 2 +- .../walkers/haplotypecaller/LikelihoodCalculationEngine.java | 2 +- .../haplotypecaller/HaplotypeCallerIntegrationTest.java | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 8738def50..d91df82e2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -217,7 +217,7 @@ public class GenotypingEngine { } cleanUpSymbolicUnassembledEvents( haplotypes ); - if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure + if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); } if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode! diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 62554c4ab..e90d89a8c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -139,7 +139,7 @@ public class LikelihoodCalculationEngine { } private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) { - for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){ + for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ) { if( b1[iii] != b2[iii] ) { return iii; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 1d4fad19e..a0849d0c1 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(CEUTRIO_BAM, "", "30598abeeb0b0ae5816ffdbf0c4044fd"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -94,6 +94,4 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { Arrays.asList("79af83432dc4a1768b3ebffffc4d2b8f")); executeTest("HC calling on a ReducedRead BAM", spec); } - - } From a27ee26481e94b182003212944d75001e6a1a4c0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 24 Oct 2012 14:08:39 -0400 Subject: [PATCH 29/34] updating HC integration test. --- .../walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a0849d0c1..71f03ea00 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,7 +21,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "ee866a8694a6f6c77242041275350ab9"); + HCTest(CEUTRIO_BAM, "", "1d826f852ec1457efbfa7ee828c5783a"); } @Test From 991658acf485c3c7634387eea0dab46018aa6894 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 22 Oct 2012 13:36:07 -0400 Subject: [PATCH 30/34] BQSR: use more granular locking for concurrency control -With this change, BQSR performance scales properly by thread rather than gaining nothing from additional threads. -Benefits are seen when using either -nt (HierarchicalMicroScheduler) or -nct (NanoScheduler) -Removes high-level locks in the recalibration engines and NestedIntegerArray in favor of maximally-granular locks on and around manipulation of the leaf nodes of the NestedIntegerArray. -NestedIntegerArray now creates all interior nodes upfront rather than on the fly to avoid the need for locking during tree traversals. This uses more memory in the initial part of BQSR runs, but the BQSR would eventually converge to use this memory anyway over the course of a typical run. IMPORTANT NOTE: This does not mean it's safe to run the old BaseRecalibrator walker with multiple threads. The BaseRecalibrator walker is and will never be thread-safe, as it's a LocusWalker that uses read attributes to track state information. ONLY the newer DelocalizedBaseRecalibrator can be made thread-safe (and will hopefully be made so in my subsequent commits). This commit addresses performance, not correctness. --- .../bqsr/AdvancedRecalibrationEngine.java | 71 +++----- .../bqsr/StandardRecalibrationEngine.java | 157 +++++++++++++++--- .../LoggingNestedIntegerArray.java | 6 +- .../utils/collections/NestedIntegerArray.java | 56 +++++-- .../utils/threading/ThreadLocalArray.java | 64 +++++++ 5 files changed, 270 insertions(+), 84 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index b89f68e24..be8357679 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -28,26 +28,22 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.threading.ThreadLocalArray; public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { - // optimizations: don't reallocate an array each time - private byte[] tempQualArray; - private boolean[] tempErrorArray; - private double[] tempFractionalErrorArray; + // optimization: only allocate temp arrays once per thread + private final ThreadLocal threadLocalTempQualArray = new ThreadLocalArray(EventType.values().length, byte.class); + private final ThreadLocal threadLocalTempErrorArray = new ThreadLocalArray(EventType.values().length, boolean.class); + private final ThreadLocal threadLocalTempFractionalErrorArray = new ThreadLocalArray(EventType.values().length, double.class); public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { super.initialize(covariates, recalibrationTables); - tempQualArray = new byte[EventType.values().length]; - tempErrorArray = new boolean[EventType.values().length]; - tempFractionalErrorArray = new double[EventType.values().length]; } /** @@ -60,10 +56,13 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp * @param refBase The reference base at this locus */ @Override - public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { + public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { final int offset = pileupElement.getOffset(); final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); + byte[] tempQualArray = threadLocalTempQualArray.get(); + boolean[] tempErrorArray = threadLocalTempErrorArray.get(); + tempQualArray[EventType.BASE_SUBSTITUTION.index] = pileupElement.getQual(); tempErrorArray[EventType.BASE_SUBSTITUTION.index] = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase); tempQualArray[EventType.BASE_INSERTION.index] = pileupElement.getBaseInsertionQual(); @@ -77,40 +76,29 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp final byte qual = tempQualArray[eventIndex]; final boolean isError = tempErrorArray[eventIndex]; - final NestedIntegerArray rgRecalTable = recalibrationTables.getReadGroupTable(); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); + // TODO: should this really be combine rather than increment? + combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - final NestedIntegerArray qualRecalTable = recalibrationTables.getQualityScoreTable(); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); - if (qualPreviousDatum == null) - qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(isError); + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); - else - covPreviousDatum.increment(isError); + + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } } @Override - public synchronized void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { for( int offset = 0; offset < read.getReadBases().length; offset++ ) { if( !skip[offset] ) { final ReadCovariates readCovariates = covariateKeySetFrom(read); + byte[] tempQualArray = threadLocalTempQualArray.get(); + double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get(); + tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; @@ -124,30 +112,15 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp final byte qual = tempQualArray[eventIndex]; final double isError = tempFractionalErrorArray[eventIndex]; - final NestedIntegerArray rgRecalTable = recalibrationTables.getReadGroupTable(); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); + combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - final NestedIntegerArray qualRecalTable = recalibrationTables.getQualityScoreTable(); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); - if (qualPreviousDatum == null) - qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(1.0, isError); + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); - else - covPreviousDatum.increment(1.0, isError); + + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 4fe9c5323..1f7fbbf42 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -55,7 +55,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP * @param refBase The reference base at this locus */ @Override - public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { + public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { final int offset = pileupElement.getOffset(); final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); @@ -65,35 +65,21 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION); final int eventIndex = EventType.BASE_SUBSTITUTION.index; - final NestedIntegerArray rgRecalTable = recalibrationTables.getReadGroupTable(); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); + // TODO: should this really be combine rather than increment? + combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - final NestedIntegerArray qualRecalTable = recalibrationTables.getQualityScoreTable(); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); - if (qualPreviousDatum == null) - qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(isError); + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); - else - covPreviousDatum.increment(isError); + + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } @Override - public synchronized void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + public void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { throw new UnsupportedOperationException("Delocalized BQSR is not available in the GATK-lite version"); } @@ -121,4 +107,133 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP protected ReadCovariates covariateKeySetFrom(GATKSAMRecord read) { return (ReadCovariates) read.getTemporaryAttribute(BaseRecalibrator.COVARS_ATTRIBUTE); } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error status for this event + * @param keys location in table of our item + */ + protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final boolean isError, final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + // There are three cases here: + // + // 1. The original get succeeded (existingDatum != null) because there already was an item in this position. + // In this case we can just increment the existing item. + // + // 2. The original get failed (existingDatum == null), and we successfully put a new item in this position + // and are done. + // + // 3. The original get failed (existingDatum == null), AND the put fails because another thread comes along + // in the interim and puts an item in this position. In this case we need to do another get after the + // failed put to get the item the other thread put here and increment it. + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(isError); + } + } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final double isError, final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1.0, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1.0, isError); + } + } + + /** + * Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a + * new item there if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error status for this event + * @param keys location in table of our item + */ + protected void combineDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final boolean isError, final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + final RecalDatum newDatum = createDatumObject(qual, isError); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(newDatum, keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and combine it with our item (item is guaranteed to exist at this point) + table.get(keys).combine(newDatum); + } + } + else { + // Easy case: already an item here, so combine it with our item + existingDatum.combine(newDatum); + } + } + + /** + * Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a + * new item there if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + protected void combineDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final double isError, final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + final RecalDatum newDatum = createDatumObject(qual, isError); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(newDatum, keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and combine it with our item (item is guaranteed to exist at this point) + table.get(keys).combine(newDatum); + } + } + else { + // Easy case: already an item here, so combine it with our item + existingDatum.combine(newDatum); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java index 6fda0245b..63927ac84 100644 --- a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java @@ -99,9 +99,7 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { } @Override - public void put( final T value, final int... keys ) { - super.put(value, keys); - + public boolean put( final T value, final int... keys ) { StringBuilder logEntry = new StringBuilder(); logEntry.append(logEntryLabel); @@ -116,5 +114,7 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { // PrintStream methods all use synchronized blocks internally, so our logging is thread-safe log.println(logEntry.toString()); + + return super.put(value, keys); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java index 31d316555..9f1eb546d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java @@ -50,6 +50,28 @@ public class NestedIntegerArray { this.dimensions = dimensions.clone(); data = new Object[dimensions[0]]; + prepopulateArray(data, 0); + } + + /** + * Recursively allocate the entire tree of arrays in all its dimensions. + * + * Doing this upfront uses more memory initially, but saves time over the course of the run + * and (crucially) avoids having to make threads wait while traversing the tree to check + * whether branches exist or not. + * + * @param subarray current node in the tree + * @param dimension current level in the tree + */ + private void prepopulateArray( Object[] subarray, int dimension ) { + if ( dimension >= numDimensions - 1 ) { + return; + } + + for ( int i = 0; i < subarray.length; i++ ) { + subarray[i] = new Object[dimensions[dimension + 1]]; + prepopulateArray((Object[])subarray[i], dimension + 1); + } } public T get(final int... keys) { @@ -59,14 +81,15 @@ public class NestedIntegerArray { for( int i = 0; i < numNestedDimensions; i++ ) { if ( keys[i] >= dimensions[i] ) return null; - myData = (Object[])myData[keys[i]]; - if ( myData == null ) - return null; + + myData = (Object[])myData[keys[i]]; // interior nodes in the tree will never be null, so we can safely traverse + // down to the leaves } + return (T)myData[keys[numNestedDimensions]]; } - public synchronized void put(final T value, final int... keys) { // WARNING! value comes before the keys! + public boolean put(final T value, final int... keys) { // WARNING! value comes before the keys! if ( keys.length != numDimensions ) throw new ReviewedStingException("Exactly " + numDimensions + " keys should be passed to this NestedIntegerArray but " + keys.length + " were provided"); @@ -75,15 +98,26 @@ public class NestedIntegerArray { for ( int i = 0; i < numNestedDimensions; i++ ) { if ( keys[i] >= dimensions[i] ) throw new ReviewedStingException("Key " + keys[i] + " is too large for dimension " + i + " (max is " + (dimensions[i]-1) + ")"); - Object[] temp = (Object[])myData[keys[i]]; - if ( temp == null ) { - temp = new Object[dimensions[i+1]]; - myData[keys[i]] = temp; - } - myData = temp; + + myData = (Object[])myData[keys[i]]; // interior nodes in the tree will never be null, so we can safely traverse + // down to the leaves } - myData[keys[numNestedDimensions]] = value; + synchronized ( myData ) { // lock the bottom row while we examine and (potentially) update it + + // Insert the new value only if there still isn't any existing value in this position + if ( myData[keys[numNestedDimensions]] == null ) { + myData[keys[numNestedDimensions]] = value; + } + else { + // Already have a value for this leaf (perhaps another thread came along and inserted one + // while we traversed the tree), so return false to notify the caller that we didn't put + // the item + return false; + } + } + + return true; } public List getAllValues() { diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java new file mode 100644 index 000000000..cc50152ac --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.threading; + +import java.lang.reflect.Array; + +/** + * ThreadLocal implementation for arrays + * + * Example usage: + * + * private ThreadLocal threadLocalByteArray = new ThreadLocalArray(length, byte.class); + * .... + * byte[] byteArray = threadLocalByteArray.get(); + * + * @param the type of the array itself (eg., int[], double[], etc.) + * + * @author David Roazen + */ +public class ThreadLocalArray extends ThreadLocal { + private int arraySize; + private Class arrayElementType; + + /** + * Create a new ThreadLocalArray + * + * @param arraySize desired length of the array + * @param arrayElementType type of the elements within the array (eg., Byte.class, Integer.class, etc.) + */ + public ThreadLocalArray( int arraySize, Class arrayElementType ) { + super(); + + this.arraySize = arraySize; + this.arrayElementType = arrayElementType; + } + + @Override + @SuppressWarnings("unchecked") + protected T initialValue() { + return (T)Array.newInstance(arrayElementType, arraySize); + } +} From 32a6d7000a4dab5f208c9d5e78f35b314edbaec3 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 24 Oct 2012 13:06:34 -0400 Subject: [PATCH 31/34] Thread-safe ReadGroupCovariate The ReadGroupCovariate class was not thread-safe. This led to horrible race conditions in multithreaded runs of the BQSR where (for example) the same read group could get inserted into the reverse lookup table twice with different IDs. Should fix the intermittent crash reported in GSA-492. --- .../covariates/ReadGroupCovariate.java | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java index 85568dac9..29c15adf7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java @@ -66,7 +66,9 @@ public class ReadGroupCovariate implements RequiredCovariate { } @Override - public String formatKey(final int key) { + public synchronized String formatKey(final int key) { + // This method is synchronized so that we don't attempt to do a get() + // from the reverse lookup table while that table is being updated return readGroupReverseLookupTable.get(key); } @@ -76,17 +78,32 @@ public class ReadGroupCovariate implements RequiredCovariate { } private int keyForReadGroup(final String readGroupId) { - if (!readGroupLookupTable.containsKey(readGroupId)) { - readGroupLookupTable.put(readGroupId, nextId); - readGroupReverseLookupTable.put(nextId, readGroupId); - nextId++; + // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), + // synchronize only the table updates. + + // Before entering the synchronized block, check to see if this read group is not in our tables. + // If it's not, either we will have to insert it, OR another thread will insert it first. + // This preliminary check avoids doing any synchronization most of the time. + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + + synchronized ( this ) { + + // Now we need to make sure the key is STILL not there, since another thread may have come along + // and inserted it while we were waiting to enter this synchronized block! + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + readGroupLookupTable.put(readGroupId, nextId); + readGroupReverseLookupTable.put(nextId, readGroupId); + nextId++; + } + } } return readGroupLookupTable.get(readGroupId); } @Override - public int maximumKeyValue() { + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated return readGroupLookupTable.size() - 1; } From 02018ca76495b1405a3b1ec4086afdb470ef0b95 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 24 Oct 2012 15:06:13 -0400 Subject: [PATCH 32/34] Legacy BaseRecalibrator walker is neither TreeReducible nor NanoSchedulable The old BaseRecalibrator walker is and never will be thread-safe, since it's a LocusWalker that uses read attributes to track state. ONLY the newer DelocalizedBaseRecalibrator is believed likely to be thread-safe at this point. It is safe to run the DelocalizedBaseRecalibrator with -nct > 1 for testing purposes, but wait for further testing to be done before using it for production purposes in multithreaded mode. --- .../sting/gatk/walkers/bqsr/BQSRIntegrationTest.java | 5 ----- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 7 +------ 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 26ee78484..90d67d393 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -73,11 +73,6 @@ public class BQSRIntegrationTest extends WalkerTest { params.getCommandLine(), Arrays.asList(params.md5)); executeTest("testBQSR-"+params.args, spec).getFirst(); - - WalkerTestSpec specNT2 = new WalkerTestSpec( - params.getCommandLine() + " -nt 2", - Arrays.asList(params.md5)); - executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst(); } @Test diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 2c7a12a36..245c8e3e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -109,7 +109,7 @@ import java.util.ArrayList; @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file @Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality @PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta -public class BaseRecalibrator extends LocusWalker implements TreeReducible, NanoSchedulable { +public class BaseRecalibrator extends LocusWalker { @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates @@ -277,11 +277,6 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed return sum; } - public Long treeReduce(Long sum1, Long sum2) { - sum1 += sum2; - return sum1; - } - @Override public void onTraversalDone(Long result) { logger.info("Calculating quantized quality scores..."); From d9aa9855f874f80ddc629c3c7179c00108991c9e Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 24 Oct 2012 15:29:13 -0400 Subject: [PATCH 33/34] Better comments in NestedIntegerArray --- .../utils/collections/NestedIntegerArray.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java index 9f1eb546d..c83de8ec7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java @@ -89,6 +89,20 @@ public class NestedIntegerArray { return (T)myData[keys[numNestedDimensions]]; } + /** + * Insert a value at the position specified by the given keys. + * + * This method is THREAD-SAFE despite not being synchronized, however the caller MUST + * check the return value to see if the put succeeded. This method RETURNS FALSE if + * the value could not be inserted because there already was a value present + * at the specified location. In this case the caller should do a get() to get + * the already-existing value and (potentially) update it. + * + * @param value value to insert + * @param keys keys specifying the location of the value in the tree + * @return true if the value was inserted, false if it could not be inserted because there was already + * a value at the specified position + */ public boolean put(final T value, final int... keys) { // WARNING! value comes before the keys! if ( keys.length != numDimensions ) throw new ReviewedStingException("Exactly " + numDimensions + " keys should be passed to this NestedIntegerArray but " + keys.length + " were provided");