From aae04b6122edaba865622bdff248e80453371466 Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Tue, 6 Jan 2015 13:36:36 -0500 Subject: [PATCH] Fixes explicit limitation of the maximum ploidy of the reference-confidence model Story: ===== - https://www.pivotaltracker.com/story/show/83803796 Changes: ======= - From a fix maximum ploidy indel RCM likelihood cache to a dynamically resizable one. - Used the occassion to removed an unused and deprecated method from ReferenceConfidenceModel Testing: ======= - Added integration test to check on ploidies larger than the previous limit of 20. --- .../ReferenceConfidenceModel.java | 53 +++---------------- .../HaplotypeCallerGVCFIntegrationTest.java | 31 +++++++++++ 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java index 5320cb52e..45a7b0ff7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -265,26 +265,23 @@ public class ReferenceConfidenceModel { * @return non-null GenotypeLikelihoods given N */ protected final GenotypeLikelihoods getIndelPLs(final int ploidy, final int nInformativeReads) { - if (ploidy > MAX_N_INDEL_PLOIDY) - throw new IllegalArgumentException("you have hit a current limitation of the GVCF output model that cannot handle ploidies larger than " + MAX_N_INDEL_PLOIDY + " , please let the GATK team about it: " + ploidy); return indelPLCache(ploidy, nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads); } protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway - private static final int MAX_N_INDEL_PLOIDY = 20; - private static final GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_PLOIDY][]; + private static final int INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY = 20; + private static GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY + 1][]; private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp private final GenotypeLikelihoods indelPLCache(final int ploidy, final int nInformativeReads) { - GenotypeLikelihoods[] indelPLCacheByPloidy = indelPLCache[ploidy]; - if (indelPLCacheByPloidy == null) - return initializeIndelPLCache(ploidy)[nInformativeReads]; - else - return indelPLCacheByPloidy[nInformativeReads]; + return initializeIndelPLCache(ploidy)[nInformativeReads]; } private synchronized GenotypeLikelihoods[] initializeIndelPLCache(final int ploidy) { - // Double-check whether another thread has done the initialization. + + if (indelPLCache.length <= ploidy) + indelPLCache = Arrays.copyOf(indelPLCache,ploidy << 1); + if (indelPLCache[ploidy] != null) return indelPLCache[ploidy]; @@ -308,42 +305,6 @@ public class ReferenceConfidenceModel { return result; } - /** - * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt - * - * @param pileup the read backed pileup containing the data we want to evaluate - * @param refBase the reference base at this pileup position - * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation - * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips - * @return a RefVsAnyResult genotype call - */ - @Deprecated - public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) { - final RefVsAnyResult result = new RefVsAnyResult(); - - for( final PileupElement p : pileup ) { - final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual()); - if( p.isDeletion() || qual > minBaseQual ) { - int AA = 0; final int AB = 1; int BB = 2; - if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { - AA = 2; - BB = 0; - if( hqSoftClips != null && p.isNextToSoftClip() ) { - hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28)); - } - result.AD_Ref_Any[1]++; - } else { - result.AD_Ref_Any[0]++; - } - result.genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual); - result.genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); - result.genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; - } - } - - return result; - } - /** * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt * diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 1922cd92d..981d081e7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -126,6 +126,25 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { return tests.toArray(new Object[][]{}); } + @DataProvider(name = "MyDataProviderManyploid") + public Object[][] makeMyDataProviderManyploid() { + List tests = new ArrayList<>(); + + final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; + final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "d8daf60c68f39875680cc037f264ccc0"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "dd1e6ee5e242fa1caa7f64a9605e4b16"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a040bb2ebacf5afab3490504730043a1"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "8b44f98c12ed7949d716852e0a21b346"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "4872ba8a27a29c9cac5e5c787ca853b5"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "65fc0938ca7df792a36413465878bc89"}); + + return tests.toArray(new Object[][]{}); + } + + /** * Example testng test using MyDataProvider */ @@ -162,6 +181,18 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { executeTest(name, spec); } + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "MyDataProviderManyploid", enabled=true) + public void testHCWithGVCFManyploid(final String bam, final ReferenceConfidenceMode mode, final String intervals, final String md5) { + final String commandLine = String.format("-T HaplotypeCaller -ploidy 33 --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String name = "testHCWithGVCFManyploid bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); + executeTest(name, spec); + } + @Test public void testERCRegionWithNoCalledHaplotypes() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d",