From 85dce75f3f1a8be329212622bf595665282ce583 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Wed, 1 Jun 2016 17:21:48 -0400 Subject: [PATCH 01/68] Update pom versions to mark the start of GATK 3.7 development --- pom.xml | 2 +- protected/gatk-package-distribution/pom.xml | 2 +- protected/gatk-queue-extensions-distribution/pom.xml | 2 +- protected/gatk-queue-package-distribution/pom.xml | 2 +- protected/gatk-tools-protected/pom.xml | 2 +- protected/pom.xml | 2 +- public/VectorPairHMM/pom.xml | 2 +- public/external-example/pom.xml | 2 +- public/gatk-engine/pom.xml | 2 +- public/gatk-queue-extensions-generator/pom.xml | 2 +- public/gatk-queue-extensions-public/pom.xml | 2 +- public/gatk-queue/pom.xml | 2 +- public/gatk-root/pom.xml | 2 +- public/gatk-tools-public/pom.xml | 2 +- public/gatk-utils/pom.xml | 2 +- public/gsalib/pom.xml | 2 +- public/package-tests/pom.xml | 2 +- public/pom.xml | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pom.xml b/pom.xml index 7f6394a8a..c238762c0 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,7 @@ org.broadinstitute.gatk gatk-root - 3.6 + 3.7-SNAPSHOT public/gatk-root diff --git a/protected/gatk-package-distribution/pom.xml b/protected/gatk-package-distribution/pom.xml index 8486ee153..840d2356d 100644 --- a/protected/gatk-package-distribution/pom.xml +++ b/protected/gatk-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.6 + 3.7-SNAPSHOT ../.. diff --git a/protected/gatk-queue-extensions-distribution/pom.xml b/protected/gatk-queue-extensions-distribution/pom.xml index eed2e0db4..2acb0c09a 100644 --- a/protected/gatk-queue-extensions-distribution/pom.xml +++ b/protected/gatk-queue-extensions-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.6 + 3.7-SNAPSHOT ../.. 
diff --git a/protected/gatk-queue-package-distribution/pom.xml b/protected/gatk-queue-package-distribution/pom.xml index 0ae5b23c7..f4cc8663b 100644 --- a/protected/gatk-queue-package-distribution/pom.xml +++ b/protected/gatk-queue-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.6 + 3.7-SNAPSHOT ../.. diff --git a/protected/gatk-tools-protected/pom.xml b/protected/gatk-tools-protected/pom.xml index ba6937868..350bab208 100644 --- a/protected/gatk-tools-protected/pom.xml +++ b/protected/gatk-tools-protected/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.6 + 3.7-SNAPSHOT ../.. diff --git a/protected/pom.xml b/protected/pom.xml index 8177b7026..99fc50776 100644 --- a/protected/pom.xml +++ b/protected/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-root - 3.6 + 3.7-SNAPSHOT ../public/gatk-root diff --git a/public/VectorPairHMM/pom.xml b/public/VectorPairHMM/pom.xml index 82157d5fe..a4deb1ed5 100644 --- a/public/VectorPairHMM/pom.xml +++ b/public/VectorPairHMM/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-root - 3.6 + 3.7-SNAPSHOT ../../public/gatk-root diff --git a/public/external-example/pom.xml b/public/external-example/pom.xml index bc85bc6c3..13759ddce 100644 --- a/public/external-example/pom.xml +++ b/public/external-example/pom.xml @@ -9,7 +9,7 @@ External Example - 3.6 + 3.7-SNAPSHOT - 2.4.1 - 2.4.1 + 2.5.0 + 2.5.0 diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/CNV/SymbolicAllelesIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/CNV/SymbolicAllelesIntegrationTest.java index f1746cb6d..c63a6e8ea 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/CNV/SymbolicAllelesIntegrationTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/CNV/SymbolicAllelesIntegrationTest.java @@ -57,7 +57,7 @@ public class 
SymbolicAllelesIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(b36KGReference, "symbolic_alleles_2.vcf"), 1, - Arrays.asList("c8b294089832bb1a2c450b550318a471")); + Arrays.asList("3f97ed3243fad7f953f4859af92d227f")); executeTest("Test symbolic alleles mixed in with non-symbolic alleles", spec); } } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java index f16178d26..d413f6f79 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java @@ -503,7 +503,7 @@ public class MannWhitneyU { * @return P-value based on histogram with u calculated for every possible permutation of group tag. */ public double permutationTest(final double[] series1, final double[] series2, final double testStatU) { - final Histogram histo = new Histogram(); + final Histogram histo = new Histogram<>(); final int n1 = series1.length; final int n2 = series2.length; @@ -555,7 +555,7 @@ public class MannWhitneyU { */ double sumOfAllSmallerBins = histo.get(testStatU).getValue() / 2.0; - for (final Histogram.Bin bin : histo.values()) { + for (final Histogram.Bin bin : histo.values()) { if (bin.getId() < testStatU) sumOfAllSmallerBins += bin.getValue(); } From 07052ba8eafd2b66fb7b663158afa96ab64abb65 Mon Sep 17 00:00:00 2001 From: Valentin Ruano Rubio Date: Thu, 26 May 2016 14:59:26 -0400 Subject: [PATCH 11/68] Changes to use the median rather than the second best likelihood for the NON_REF allele Addresses issue #1378 following the first proposal using the 'median' rather than the 'mean'. 
--- .../genotyper/ReadLikelihoodsUnitTest.java | 28 ++++++-- .../HaplotypeCallerGVCFIntegrationTest.java | 27 ++++---- .../gatk/utils/genotyper/ReadLikelihoods.java | 64 ++++++++++++++++--- 3 files changed, 90 insertions(+), 29 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java index b4cae1373..15a092cd4 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java @@ -53,11 +53,12 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.SAMFileHeader; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.utils.genotyper.*; +import org.apache.commons.math3.stat.descriptive.rank.Median; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.*; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; @@ -472,14 +473,24 @@ public class ReadLikelihoodsUnitTest secondBestLk = lk; } } - final double expectedNonRefLk = Double.isInfinite(secondBestLk) ? 
bestLk : secondBestLk; + final Median median = new Median(); + final List qualifylingLikelihoods = new ArrayList<>(); + for (int a = 0; a < ordinaryAlleleCount; a++) { + if (originalLikelihoods[s][a][r] >= bestLk) continue; + qualifylingLikelihoods.add(originalLikelihoods[s][a][r]); + } + final double medianLikelihood = median.evaluate(qualifylingLikelihoods.stream().mapToDouble(d -> d).toArray()); + // NaN is returned in cases whether there is no elements in qualifyingLikelihoods. + // In such case we set the NON-REF likelihood to -Inf. + final double expectedNonRefLk = !Double.isNaN(medianLikelihood) ? medianLikelihood + : ordinaryAlleleCount <= 1 ? Double.NaN : bestLk; newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk; } } testLikelihoodMatrixQueries(samples,result,newLikelihoods); } - private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods result, final double[][][] likelihoods) { + private void testLikelihoodMatrixQueries(final String[] samples, final ReadLikelihoods result, final double[][][] likelihoods) { for (final String sample : samples) { final int sampleIndex = result.sampleIndex(sample); final int sampleReadCount = result.sampleReadCount(sampleIndex); @@ -487,9 +498,14 @@ public class ReadLikelihoodsUnitTest Assert.assertEquals(result.alleleCount(), alleleCount); for (int a = 0; a < alleleCount; a++) { Assert.assertEquals(result.sampleReadCount(sampleIndex),sampleReadCount); - for (int r = 0; r < sampleReadCount; r++) - Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a,r), - likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON); + for (int r = 0; r < sampleReadCount; r++) { + if (Double.isNaN(result.sampleMatrix(sampleIndex).get(a, r))) { + Assert.assertTrue(likelihoods != null && Double.isNaN(likelihoods[sampleIndex][a][r])); + } else { + Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a, r), + likelihoods == null ? 
0.0 : likelihoods[sampleIndex][a][r], EPSILON); + } + } } } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 0cafbff8e..fd15a2834 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -65,6 +65,7 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { @@ -85,8 +86,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { //TODO the following test is commented out for the record //tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8d30370465d74fd549d76dd31adc4c0c"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "b7a5f4e40d5ebaf5f6c46a3d4355c817"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "0f5e6f2584649a1b7386d94e3dc60f91"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "119a30fac57a0e5cf1b8164c1059b22c"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a6bbc30b82e7864baf64163d55f5aee5"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"}); tests.add(new 
Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2c67bdc08c8784f2114c2039270b9766"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "63fa5841a21e2c13f1e1a8e2d4ea3380"}); @@ -104,8 +105,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ae2c7e570855f6d6ca58ddd1089a970"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "22e03f01e91177011ac028d2347751ba"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "cb3f16bc10e1cc75f2093bec92145d18"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "822856b75c792be81693019bee672c09"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "6ef5ce3fbc943f15c077a0f12ff5bc2e"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"}); @@ -127,8 +128,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "90b25f3050435c9e67aa0ee325c24167"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "5f329540dc5c4556ab029d0e2cfcabcb"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, 
"2f1534d30b51fd8a7861d73091be2336"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "2307bcb9f9e3468375a389720036b7da"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "a0be095ed902a8acdb80fb56ca4e8fb4"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "8123d8b68b6fa77ef084f292e191622a"}); @@ -145,8 +146,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "70ee4e60d9f86b63aaab09075a71ddd3"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "700d79df3b0b481444e81471204e242e"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7dc7cfd463ecb7ac535c6ba925c46ef0"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "689d4b9cdc21be370c82251e1f7a3c4f"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "228e1d2ec2e729a5f79c37f3f2557708"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "2fc7020457dde4439b4133c098d9ab9b"}); @@ -325,7 +326,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC 
GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("6f6b2fa85cd1bae7f8f72e144fe56c96")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("1733d15e960ed473f58a2bfc7f686a2e")); spec.disableShadowBCF(); executeTest(" testAlleleSpecificAnnotations", spec); } @@ -334,7 +335,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testASMQMateRankSumAnnotation() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9613ec1ec93547cfb0651673e914bee4")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("e6e09a82cade24f8121c81c1d43b5d03")); spec.disableShadowBCF(); executeTest(" testASMQMateRankSumAnnotation", spec); } @@ -343,7 +344,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testASInsertSizeRankSum() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G 
AS_Standard --disableDithering -A AS_InsertSizeRankSum", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a8765c11b9130c815aae4e06c1f90e45")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("33db0c7e64fc963c160f8bb59d983375")); spec.disableShadowBCF(); executeTest(" testASInsertSizeRankSum", spec); } @@ -352,7 +353,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testHaplotypeCallerMultiAllelicNonRef() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A StrandAlleleCountsBySample", b37KGReference, privateTestDir + "multiallelic-nonref.bam", "2:47641259-47641859", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("1d9e75bd09a6fc5a1d9156fe8a7d43ce")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("182aa78f42235d2b4dabb87cc6c8a433")); spec.disableShadowBCF(); executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec); } @@ -361,7 +362,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testHaplotypeCallerMaxNumPLValues() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, 
GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a4b5c40b1993573c5efd992f3f0db8a9")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("1176028faca6cd397f581f9e60c474a8")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValues", spec); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java index 3a588a1fa..e03417d19 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java @@ -30,6 +30,8 @@ import htsjdk.variant.variantcontext.Allele; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import org.apache.commons.math.util.DoubleArray; +import org.apache.commons.math3.stat.descriptive.rank.Median; import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.Utils; @@ -504,23 +506,26 @@ public class ReadLikelihoods implements SampleList, AlleleList * likelihood value. * @param candidateAlleles the potentially missing alleles. * @param defaultLikelihood the default read likelihood value for that allele. + * @return {@code true} iff the the read-likelihood collection was modified by the addition of the input alleles. + * So if all the alleles in the input collection were already present in the read-likelihood collection this method + * will return {@code false}. 
* * @throws IllegalArgumentException if {@code candidateAlleles} is {@code null} or there is more than * one missing allele that is a reference or there is one but the collection already has * a reference allele. */ - public void addMissingAlleles(final Collection candidateAlleles, final double defaultLikelihood) { + public boolean addMissingAlleles(final Collection candidateAlleles, final double defaultLikelihood) { if (candidateAlleles == null) throw new IllegalArgumentException("the candidateAlleles list cannot be null"); if (candidateAlleles.isEmpty()) - return; + return false; final List allelesToAdd = new ArrayList<>(candidateAlleles.size()); for (final A allele : candidateAlleles) if (alleles.alleleIndex(allele) == -1) allelesToAdd.add(allele); if (allelesToAdd.isEmpty()) - return; + return false; final int oldAlleleCount = alleles.alleleCount(); final int newAlleleCount = alleles.alleleCount() + allelesToAdd.size(); @@ -566,6 +571,7 @@ public class ReadLikelihoods implements SampleList, AlleleList nonRefAlleleIndex = nonRefIndex; updateNonRefAlleleLikelihoods(); } + return true; } /** @@ -1171,24 +1177,57 @@ public class ReadLikelihoods implements SampleList, AlleleList throw new IllegalArgumentException("non-ref allele cannot be null"); if (!nonRefAllele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) throw new IllegalArgumentException("the non-ref allele is not valid"); - addMissingAlleles(Collections.singleton(nonRefAllele), Double.NEGATIVE_INFINITY); - updateNonRefAlleleLikelihoods(); + if (addMissingAlleles(Collections.singleton(nonRefAllele), Double.NEGATIVE_INFINITY)) { + updateNonRefAlleleLikelihoods(); + } } + /** + * Updates the likelihoods of the non-ref allele, if present, considering all concrete alleles avaialble. + */ public void updateNonRefAlleleLikelihoods() { updateNonRefAlleleLikelihoods(alleles); } + /** + * Updates the likelihood of the NonRef allele (if present) based on the likehoods of a set of concrete + * alleles. + *

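+ * This method sets, for each read, the NON_REF likelihood to the median of the likelihoods of the considered concrete alleles that fall below that read's best likelihood; when no allele qualifies it falls back to the best likelihood, or to NaN if there is at most one concrete allele. + *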

+ * + * + * @param allelesToConsider the alleles whose likelihoods are taken into account when updating the NON_REF likelihood. + */ public void updateNonRefAlleleLikelihoods(final AlleleList
allelesToConsider) { if (nonRefAlleleIndex < 0) return; + final int alleleCount = alleles.alleleCount(); + final int nonRefAlleleIndex = alleleIndex((A) GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + final int concreteAlleleCount = nonRefAlleleIndex < 0 ? alleleCount : alleleCount - 1; + // likelihood buffer reused across reads: + final double[] qualifiedAlleleLikelihoods = new double[concreteAlleleCount]; + final Median medianCalculator = new Median(); for (int s = 0; s < samples.sampleCount(); s++) { final double[][] sampleValues = valuesBySampleIndex[s]; - for (int r = 0; r < sampleValues[0].length; r++) { - final BestAllele bestAllele = searchBestAllele(s, r, true, allelesToConsider); - final double secondBestLikelihood = Double.isInfinite(bestAllele.confidence) ? bestAllele.likelihood - : bestAllele.likelihood - bestAllele.confidence; - sampleValues[nonRefAlleleIndex][r] = secondBestLikelihood; + final int readCount = sampleValues[0].length; + for (int r = 0; r < readCount; r++) { + final BestAllele bestAllele = searchBestAllele(s, r, true); + int numberOfQualifiedAlleleLikelihoods = 0; + for (int i = 0; i < alleleCount; i++) { + final double alleleLikelihood = sampleValues[i][r]; + if (i != nonRefAlleleIndex && alleleLikelihood < bestAllele.likelihood + && !Double.isNaN(alleleLikelihood) && allelesToConsider.alleleIndex(alleles.alleleAt(i)) != -1) { + qualifiedAlleleLikelihoods[numberOfQualifiedAlleleLikelihoods++] = alleleLikelihood; + } + } + final double nonRefLikelihood = medianCalculator.evaluate(qualifiedAlleleLikelihoods, 0, numberOfQualifiedAlleleLikelihoods); + // when the median is NaN that means that all applicable likekihoods are the same as the best + // so the read is not informative at all given the existing alleles. Unless there is only one (or zero) concrete + // alleles with give the same (the best) likelihood to the NON-REF. When there is only one (or zero) concrete + // alleles we set the NON-REF likelihood to NaN. 
+ sampleValues[nonRefAlleleIndex][r] = !Double.isNaN(nonRefLikelihood) ? nonRefLikelihood + : concreteAlleleCount <= 1 ? Double.NaN : bestAllele.likelihood; } } } @@ -1383,6 +1422,11 @@ public class ReadLikelihoods implements SampleList, AlleleList * Contains information about the best allele for a read search result. */ public class BestAllele { + + /** + * Minimum difference between best and second best likelihood to consider a read informative as to + * what is the most probably allele the read came from. + */ public static final double INFORMATIVE_THRESHOLD = 0.2; /** From 556cc691858a482176965bedfd62a3c4f7b5aee9 Mon Sep 17 00:00:00 2001 From: meganshand Date: Fri, 24 Jun 2016 15:31:42 -0400 Subject: [PATCH 12/68] Fix for int overflow in RankSum calculation --- ...perGeneralPloidySuite1IntegrationTest.java | 6 +-- ...perGeneralPloidySuite2IntegrationTest.java | 2 +- ...GenotyperNormalCallingIntegrationTest.java | 4 +- .../HaplotypeCallerIntegrationTest.java | 2 +- .../gatk/utils/MannWhitneyU.java | 38 +++++++++++-------- .../broadinstitute/gatk/utils/MWUnitTest.java | 7 ++++ 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 6e21e96d1..92fb16387 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -74,12 +74,12 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testBOTH_GGA_Pools() { - 
executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "c3826794a250e32b0497353ceb1deb26"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "ee5a2f8954f38d6e5d44fe50b22e43a1"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "4eb0d8018da6612cd434491f338ed5a4"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "5cb3fe396302f3d4a4a9b7b3cc1877cc"); } @Test(enabled = true) @@ -88,6 +88,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe //TODO the old MD5 is kept for the record. //TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach. 
// executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2"); - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "c2fb9b05027c2b0ac9e338d9ddda69b1"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b99416c04ba951577f43fd2d25f46388"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 51d7a5a65..4262ab665 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -63,7 +63,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","e22846de4567f576e08e00edda2931d0"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","1d27eaa3557dc28c95b9024114d50ad1"); } @Test(enabled = true) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index bdd3eb5c0..7348e12b1 100644 --- 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -102,7 +102,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("94ca1e00d4fad9c5279271c2779ff797")); + Arrays.asList("25b710f830749448cd056c9b2e7798ff")); executeTest("test Multiple SNP alleles", spec); } @@ -126,7 +126,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("c5aff2572ce09c413e7f5c9e1b3f92d6")); + Arrays.asList("1759c156bc45528504398a7ef4ce5bf8")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 09722f5ee..747609ace 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -397,7 +397,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testLackSensitivityDueToBadHaplotypeSelectionFix() { final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5514cfbcf12954bb12c725b77eaac248")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("22f5a3e9366e611509f03c984f8b4960")); spec.disableShadowBCF(); executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java index d413f6f79..d475692d4 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java @@ -280,23 +280,9 @@ public class MannWhitneyU { RankedData ranked = calculateRank(series1, series2); Rank[] ranks = ranked.getRank(); ArrayList numOfTies = ranked.getNumOfTies(); + int lengthOfRanks = ranks.length; - //Calculate number of ties transformed for formula for Sigma to calculate Z-score - ArrayList transformedTies = new ArrayList<>(); - for (int count : numOfTies) { - //If every single datapoint is tied then we want to return a p-value of .5 and - //the formula for sigma that includes the number of ties breaks down. Setting - //the number of ties to 0 in this case gets the desired result in the normal - //approximation case. 
- if (count != ranks.length) { - transformedTies.add((count * count * count) - count); - } - } - - double numOfTiesForSigma = 0.0; - for (int count : transformedTies) { - numOfTiesForSigma += count; - } + double numOfTiesForSigma = transformTies(lengthOfRanks, numOfTies); // Calculate R1 and R2 and U. float r1 = 0, r2 = 0; @@ -314,6 +300,26 @@ public class MannWhitneyU { return result; } + public double transformTies(int numOfRanks, ArrayList numOfTies) { + //Calculate number of ties transformed for formula for Sigma to calculate Z-score + ArrayList transformedTies = new ArrayList<>(); + for (int count : numOfTies) { + //If every single datapoint is tied then we want to return a p-value of .5 and + //the formula for sigma that includes the number of ties breaks down. Setting + //the number of ties to 0 in this case gets the desired result in the normal + //approximation case. + if (count != numOfRanks) { + transformedTies.add((Math.pow(count, 3)) - count); + } + } + + double numOfTiesForSigma = 0.0; + for (double count : transformedTies) { + numOfTiesForSigma += count; + } + + return(numOfTiesForSigma); + } /** * Calculates the rank-sum test statisic U (sometimes W) from two sets of input data for a one-sided test * with an int indicating which group is the dominator. 
Returns a test statistic object with trueU and number of diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java index a3e781343..d99fd42dd 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java @@ -30,6 +30,7 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; +import java.util.ArrayList; import java.util.Arrays; /** @@ -139,4 +140,10 @@ public class MWUnitTest extends BaseTest { System.out.println("1-side p: " + test.getP()); Assert.assertEquals(test.getZ(), Z, DELTA_PRECISION, name); } + + @Test + public void testTooManyTies(){ + ArrayList listOfNumberOfTies = new ArrayList<>(Arrays.asList(26,3,6,4,13,18,29,36,60,58,87,63,98,125,158,185,193,171,17592,115,100,141,216,298,451,719,1060,1909,3210,5167,7135,10125,11035,3541,732,9)); + Assert.assertEquals(rst.transformTies(64890, listOfNumberOfTies), 8.41378729572e+12); + } } From 1b921666a7f92b971a47b5021b6526cd8c82230e Mon Sep 17 00:00:00 2001 From: meganshand Date: Tue, 28 Jun 2016 14:23:39 -0400 Subject: [PATCH 13/68] Change to max value of ExcessHet --- .../broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java | 3 ++- .../gatk/tools/walkers/annotator/ExcessHetUnitTest.java | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java index 063e63560..b4dd52447 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java @@ -139,8 
+139,9 @@ public class ExcessHet extends InfoFieldAnnotation implements StandardAnnotation double pval = exactTest(genotypeCounts); //If the actual phredPval would be infinity we will probably still filter out just a very large number + //Since the method does not guarantee precision for any p-value smaller than 1e-16, we can return the phred scaled version if (pval == 0) { - return Integer.MAX_VALUE; + return -10.0 * Math.log10(minNeededValue); } double phredPval = -10.0 * Math.log10(pval); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHetUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHetUnitTest.java index a28ce0337..ee0d2da8d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHetUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHetUnitTest.java @@ -223,6 +223,9 @@ public class ExcessHetUnitTest { final double EHHets = new ExcessHet().calculateEH(allHet, allHet.getGenotypes()); Assert.assertTrue(Math.abs(EHsingleton) < Math.abs(EHHets), String.format("singleton=%f allHets=%f", EHsingleton, EHHets)); + + //Since all hets is such an extreme case and the sample size is large here, we know that the p-value should be 0 + Assert.assertTrue(EHHets == 160.0, String.format("P-value of 0 should be phred scaled to 160.0")); } @DataProvider(name = "smallSets") From 4066bcd75ce9e57bd09224c991a28a219ef4be65 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Thu, 30 Jul 2015 08:30:15 -0400 Subject: [PATCH 14/68] Add new annotator for M1 clustered read position filter and M1 strand bias filter. 
--- .../cancer/m2/AbstractPowerCalculator.java | 100 +++++++++++++ .../cancer/m2/M2ArgumentCollection.java | 27 ++++ .../gatk/tools/walkers/cancer/m2/MuTect2.java | 16 +- .../cancer/m2/SomaticGenotypingEngine.java | 86 +++++++++++ .../cancer/m2/TumorPowerCalculator.java | 139 ++++++++++++++++++ 5 files changed, 367 insertions(+), 1 deletion(-) create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/AbstractPowerCalculator.java create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/AbstractPowerCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/AbstractPowerCalculator.java new file mode 100644 index 000000000..d9743dc33 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/AbstractPowerCalculator.java @@ -0,0 +1,100 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. 
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.tools.walkers.cancer.m2; + +import java.util.HashMap; + +public class AbstractPowerCalculator { + protected HashMap cache = new HashMap(); + protected double constantEps; + protected double constantLodThreshold; + + protected static class PowerCacheKey { + private int n; + private double delta; + + public PowerCacheKey(int n, double delta) { + this.n = n; + this.delta = delta; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + PowerCacheKey that = (PowerCacheKey) o; + + if (Double.compare(that.delta, delta) != 0) return false; + if (n != that.n) return false; + + return true; + } + + @Override + public int hashCode() { + int result; + long temp; + result = n; + temp = delta != +0.0d ? Double.doubleToLongBits(delta) : 0L; + result = 31 * result + (int) (temp ^ (temp >>> 32)); + return result; + } + } + + protected static double calculateLogLikelihood(int depth, int alts, double eps, double f) { + double a = (depth-alts) * Math.log10(f*eps + (1d-f)*(1d-eps)); + double b = (alts) * Math.log10(f*(1d-eps) + (1d-f)*eps); + return (a+b); + } + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java index 6aca6a35f..b3c4d1f75 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java @@ -54,6 +54,7 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Advanced; import 
org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Hidden; public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection { @Advanced @@ -119,4 +120,30 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection */ @Argument(fullName = "max_alt_allele_in_normal_fraction", required = false, doc="Threshold for maximum alternate allele fraction in normal") public double MAX_ALT_ALLELE_IN_NORMAL_FRACTION = 0.03; + + /** + * This argument is used for the M1-style strand bias filter + */ + @Argument(fullName="power_constant_qscore", doc="Phred scale quality score constant to use in power calculations", required=false) + public int POWER_CONSTANT_QSCORE = 30; + + @Hidden + @Argument(fullName = "strand_artifact_lod", required = false, doc = "LOD threshold for calling strand bias") + public float STRAND_ARTIFACT_LOD_THRESHOLD = 2.0f; + + @Hidden + @Argument(fullName = "strand_artifact_power_threshold", required = false, doc = "power threshold for calling strand bias") + public float STRAND_ARTIFACT_POWER_THRESHOLD = 0.9f; + + /** + * This argument is used for the M1-style read position filter + */ + @Argument(fullName = "pir_median_threshold", required = false, doc="threshold for clustered read position artifact median") + public double PIR_MEDIAN_THRESHOLD = 10; + + /** + * This argument is used for the M1-style read position filter + */ + @Argument(fullName = "pir_mad_threshold", required = false, doc="threshold for clustered read position artifact MAD") + public double PIR_MAD_THRESHOLD = 3; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java index 3c9bc8402..a81827b7a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java @@ -411,6 +411,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME)); headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME)); + headerInfo.add(new VCFFilterHeaderLine("M1_CLUSTERED_READ_POSITION", "Variant appears in similar read positions")); + headerInfo.add(new VCFFilterHeaderLine("M1_STRAND_BIAS", "Forward LOD vs. reverse LOD comparison indicates strand bias")); + headerInfo.add(new VCFInfoHeaderLine("TLOD_FWD",1,VCFHeaderLineType.Integer,"TLOD from forward reads only")); + headerInfo.add(new VCFInfoHeaderLine("TLOD_REV",1,VCFHeaderLineType.Integer,"TLOD from reverse reads only")); + if ( ! doNotRunPhysicalPhasing ) { headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_ID_KEY)); headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY)); @@ -829,6 +834,15 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME); } + Integer tumorFwdPosMedian = (Integer) vc.getAttribute("TUMOR_FWD_POS_MEDIAN"); + Integer tumorRevPosMedian = (Integer) vc.getAttribute("TUMOR_REV_POS_MEDIAN"); + Integer tumorFwdPosMAD = (Integer) vc.getAttribute("TUMOR_FWD_POS_MAD"); + Integer tumorRevPosMAD = (Integer) vc.getAttribute("TUMOR_REV_POS_MAD"); + //If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter + if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) || + (tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD)) + 
filters.add("M1_CLUSTERED_READ_POSITION"); + return filters; } @@ -1052,7 +1066,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); //protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", // "RMSMappingQuality","MappingQualityRankSumTest","FisherStrand","StrandOddsRatio","ReadPosRankSumTest","QualByDepth", "Coverage"})); - protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"})); + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts", "ClusteredEventsAnnotator"})); /** * Which annotations to exclude from output in the VCF file. 
Note that this argument has higher priority than the -A or -G arguments, diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java index 0a0d42b06..4c918823b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java @@ -78,12 +78,16 @@ import java.util.*; public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { protected M2ArgumentCollection MTAC; + private TumorPowerCalculator strandArtifactPowerCalculator; private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class); public SomaticGenotypingEngine(final M2ArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser, final AFCalculatorProvider afCalculatorProvider, final boolean doPhysicalPhasing, final M2ArgumentCollection MTAC) { super(configuration, samples, genomeLocParser, afCalculatorProvider, doPhysicalPhasing); this.MTAC = MTAC; + // coverage related initialization + double powerConstantEps = Math.pow(10, -1 * (MTAC.POWER_CONSTANT_QSCORE/10)); + strandArtifactPowerCalculator = new TumorPowerCalculator(powerConstantEps, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); } /** @@ -224,6 +228,17 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { double[] tumorGLs = getVariableGenotypeLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, afs); + PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap(); + PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap(); + splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM); + + // TODO: TS 
uncomment and fix +// double f_fwd = estimateAlleleFraction(mergedVC, forwardPRALM); +// double[] tumorGLs_fwd = getVariableGenotypeLikelihoods(mergedVC, forwardPRALM, f_fwd); +// +// double f_rev = estimateAlleleFraction(mergedVC, reversePRALM); +// double[] tumorGLs_rev = getVariableGenotypeLikelihoods(mergedVC, reversePRALM, f_rev); + PerReadAlleleLikelihoodMap normalPRALM = null; double[] normalGLs = null; if (hasNormal) { @@ -253,6 +268,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { double[] normalLods = new double[numAlts]; + // TODO: TS extend fwd-rev approach for multiple alt alleles +// int REF = 0, HET = 1; +// double tumorLod = tumorGLs[HET] - tumorGLs[REF]; +// double tumorLod_fwd = tumorGLs_fwd[HET] - tumorGLs_fwd[REF]; +// double tumorLod_rev = tumorGLs_rev[HET] - tumorGLs_rev[REF]; +// double normalLod = 0; if (hasNormal) { GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); Collection cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc); @@ -277,9 +298,27 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } } + // TS: if more than one passing alt alleles, filter it out, so doesn't matter which one we pick + final double tumorLod = tumorLods[lodInd]; final double normalLod = normalLods[lodInd]; + double tumorSBpower_fwd; + double tumorSBpower_rev; + + // TODO: TS fix +// try { +// tumorSBpower_fwd = strandArtifactPowerCalculator.cachingPowerCalculation(forwardPRALM.getNumberOfStoredElements(), f_fwd); +// tumorSBpower_rev = strandArtifactPowerCalculator.cachingPowerCalculation(reversePRALM.getNumberOfStoredElements(), f_rev); +// } +// catch (Throwable t) { +// System.err.println("Error processing " + activeRegionWindow.getContig() + ":" + loc); +// t.printStackTrace(System.err); +// +// throw new RuntimeException(t); +// } + + VariantContext call = null; if (tumorLod >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD && normalLod >= 
INIT_NORMAL_LOD_THRESHOLD) { VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC); @@ -289,6 +328,20 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(lodInd)).size(); + // TODO: TS revisit +// callVcb.attribute("TLOD_FWD",tumorLod_fwd); +// callVcb.attribute("TLOD_REV",tumorLod_rev); +// if ( (tumorSBpower_fwd >= MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || +// (tumorSBpower_rev >= MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ) +// callVcb.filter("M1_STRAND_BIAS"); + + // TODO: TS revisit +// if ( (tumorSBpower_fwd >= MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || +// (tumorSBpower_rev >= MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ) +// callVcb.filter("M1_STRAND_BIAS"); +// +// // FIXME: can simply get first alternate since above we only deal with Bi-allelic sites... 
+// int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size(); callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLod); callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLod); @@ -545,6 +598,39 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } } + private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap original, final PerReadAlleleLikelihoodMap forward, final PerReadAlleleLikelihoodMap reverse) { + Map> origMap = original.getLikelihoodReadMap(); + Map> fwdMap = forward.getLikelihoodReadMap(); + Map> revMap = reverse.getLikelihoodReadMap(); + + // iterate through the reads, assign reads and likelihoods to the forward or reverse maps based on the read's strand + Set forwardReads = new HashSet<>(); + Set reverseReads = new HashSet<>(); + + for(GATKSAMRecord rec : origMap.keySet()) { + if (rec.isStrandless()) + continue; + if (rec.getReadNegativeStrandFlag()) + reverseReads.add(rec); + else + forwardReads.add(rec); + } + + final Iterator>> it = origMap.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry> record = it.next(); + if(forwardReads.contains(record.getKey())) { + fwdMap.put(record.getKey(), record.getValue()); + //logM2Debug("Dropping read " + record.getKey() + " due to overlapping read fragment rules"); + } + else if (reverseReads.contains(record.getKey())){ + revMap.put(record.getKey(),record.getValue()); + } + } + + } + + // Move to utility class so we can use one shared with HaplotypeCallerGenotypingEngine private VariantContext addNonRefSymbolicAllele(final VariantContext mergedVC) { final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java new file mode 100644 index 000000000..fbb8892e5 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java @@ -0,0 +1,139 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.cancer.m2; + +import org.apache.commons.math.MathException; +import org.apache.commons.math.distribution.BinomialDistribution; +import org.apache.commons.math.distribution.BinomialDistributionImpl; + +public class TumorPowerCalculator extends AbstractPowerCalculator{ + private double constantContamination; + private boolean enableSmoothing; + + public TumorPowerCalculator(double constantEps, double constantLodThreshold, double constantContamination) { + this(constantEps, constantLodThreshold, constantContamination, true); + } + + public TumorPowerCalculator(double constantEps, double constantLodThreshold, double constantContamination, boolean enableSmoothing) { + this.constantEps = constantEps; + this.constantLodThreshold = constantLodThreshold; + this.constantContamination = constantContamination; + this.enableSmoothing = enableSmoothing; + } + + public double cachingPowerCalculation(int n, double delta) throws MathException { + PowerCacheKey key = new PowerCacheKey(n, delta); + Double power = cache.get(key); + if (power == null) { + power = calculatePower(n, constantEps, constantLodThreshold, delta, constantContamination, enableSmoothing); + cache.put(key, power); + } + return power; + } + + + + + protected static double 
calculateTumorLod(int depth, int alts, double eps, double contam) { + double f = (double) alts / (double) depth; + return (AbstractPowerCalculator.calculateLogLikelihood(depth, alts, eps, f) - AbstractPowerCalculator.calculateLogLikelihood(depth, alts, eps, Math.min(f,contam))); + } + + protected static double calculatePower(int depth, double eps, double lodThreshold, double delta, double contam, boolean enableSmoothing) throws MathException { + if (depth==0) return 0; + + // calculate the probability of each configuration + double p_alt_given_e_delta = delta*(1d-eps) + (1d-delta)*eps; + BinomialDistribution binom = new BinomialDistributionImpl(depth, p_alt_given_e_delta); + double[] p = new double[depth+1]; + for(int i=0; i= lodThreshold) { + k = i; + break; + } + } + + // if no depth meets the lod score, the power is zero + if (k == -1) { + return 0; + } + + double power = 0; + + // here we correct for the fact that the exact lod threshold is likely somewhere between + // the k and k-1 bin, so we prorate the power from that bin + // the k and k-1 bin, so we prorate the power from that bin + // if k==0, it must be that lodThreshold == lod[k] so we don't have to make this correction + if ( enableSmoothing && k > 0 ) { + double x = 1d - (lodThreshold - lod[k-1]) / (lod[k] - lod[k-1]); + power = x*p[k-1]; + } + + for(int i=k; i Date: Wed, 13 Jan 2016 16:06:57 +0900 Subject: [PATCH 15/68] Built on Laura's code to port the strand bias filter from M1 and refactored code around SomaticGenotypingEngine. Added a new integration test.
--- .../walkers/annotator/StrandBiasTest.java | 2 +- .../walkers/annotator/StrandOddsRatio.java | 1 + .../cancer/m2/ClusteredEventsAnnotator.java | 191 +++++++ .../walkers/cancer/m2/Dream_Evaluations.md | 2 +- .../cancer/m2/M2ArgumentCollection.java | 3 + .../walkers/cancer/m2/M2_HapMapSensitivity.md | 68 +++ .../gatk/tools/walkers/cancer/m2/MuTect2.java | 39 +- .../{run_M2_ICE_NN.scala => MuTectStats.java} | 80 +-- .../walkers/cancer/m2/NA12878_Evaluations.md | 20 +- ...lculator.java => PerAlleleCollection.java} | 175 +++++-- .../cancer/m2/SomaticGenotypingEngine.java | 471 +++++++++--------- .../cancer/m2/TumorPowerCalculator.java | 170 +++++-- .../cancer/m2/MuTect2IntegrationTest.java | 27 +- .../cancer/m2/PerAlleleCollectionTest.java} | 119 ++--- .../cancer/m2/TumorPowerCalculatorTest.java} | 53 +- .../annotator/VariantAnnotatorEngine.java | 8 +- .../gatk/utils/sam/AlignmentUtils.java | 1 + .../gatk/utils/variant/GATKVCFConstants.java | 6 + .../utils/variant/GATKVCFHeaderLines.java | 8 + 19 files changed, 935 insertions(+), 509 deletions(-) create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2_HapMapSensitivity.md rename protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/{run_M2_ICE_NN.scala => MuTectStats.java} (82%) rename protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/{AbstractPowerCalculator.java => PerAlleleCollection.java} (61%) rename protected/gatk-tools-protected/src/{main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/create_M2_pon.scala => test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/PerAlleleCollectionTest.java} (76%) rename protected/gatk-tools-protected/src/{main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_dream.scala => 
test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculatorTest.java} (86%) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java index d4d2ab02b..4597d359e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java @@ -241,7 +241,7 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation implements Acti /** Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: - * fw rc + * fw rv * allele1 # # * allele2 # # * @return a 2x2 contingency table diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java index ba2011215..c818f4a79 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java @@ -153,6 +153,7 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio double ratio = 0; ratio += (augmentedTable[0][0] / augmentedTable[0][1]) * (augmentedTable[1][1] / augmentedTable[1][0]); + // TODO: repeated computation: how about ratio += 1/ratio, or ratio = ratio + 1/ratio to be explicit ratio += (augmentedTable[0][1] / augmentedTable[0][0]) * (augmentedTable[1][0] / augmentedTable[1][1]); final double refRatio = (Math.min(augmentedTable[0][0], augmentedTable[0][1])/Math.max(augmentedTable[0][0], augmentedTable[0][1])); diff --git
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java new file mode 100644 index 000000000..db47d34e8 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java @@ -0,0 +1,191 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE"). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2016 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.cancer.m2; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Created by gauthier on 7/27/15. 
+ */ +public class ClusteredEventsAnnotator extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { + + private String tumorSampleName = null; + + @Override + public List getKeyNames() { return Arrays.asList("tumorForwardOffsetMedian","tumorReverseOffsetMedian","tumorForwardOffsetMAD","tumorReverseOffsetMAD"); } + + @Override + public List getDescriptions() { + //TODO: this needs a lot of re-phrasing + return Arrays.asList(new VCFInfoHeaderLine("TUMOR_FWD_POS_MEDIAN", 1, VCFHeaderLineType.Integer, "Median offset of tumor variant position from positive read end"), + new VCFInfoHeaderLine("TUMOR_FWD_POS_MAD", 1, VCFHeaderLineType.Integer, "Median absolute deviation from the median for tumor forward read positions"), + new VCFInfoHeaderLine("TUMOR_REV_POS_MEDIAN", 1, VCFHeaderLineType.Integer, "Median offset of tumor variant position from negative read end"), + new VCFInfoHeaderLine("TUMOR_REV_POS_MAD", 1, VCFHeaderLineType.Integer, "Median absolute deviation from the median for tumor reverse read positions")); + } + + @Override + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + + if (tumorSampleName == null){ + if (walker instanceof MuTect2 ) { + tumorSampleName = ((MuTect2) walker).tumorSampleName; + } else { + // ts: log error and exit + throw new IllegalStateException("ClusteredEventsAnnotator: walker is not MuTect2"); + } + } + + final Map map = new HashMap<>(); + + + if ( stratifiedPerReadAlleleLikelihoodMap != null ) { + final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(tumorSampleName); + MuTect2.logReadInfo("HAVCYADXX150109:2:2209:19034:53394", likelihoodMap.getLikelihoodReadMap().keySet(), "Present inside ClusteredEventsAnnotator:annotate"); + if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) { + double[] list = 
fillQualsFromLikelihoodMap(vc.getStart(), likelihoodMap); // [fwdMedian, revMedian, fwdMAD, revMAD] + final int FWDMEDIAN = 0, REVMEDIAN = 1, FWDMAD = 2, REVMAD = 3; // ts: make a class to contain these values + map.put("TUMOR_FWD_POS_MEDIAN", list[FWDMEDIAN]); + map.put("TUMOR_REV_POS_MEDIAN", list[REVMEDIAN]); + map.put("TUMOR_FWD_POS_MAD", list[FWDMAD]); + map.put("TUMOR_REV_POS_MAD", list[REVMAD]); + } + } + + return map; + + } + + private double[] fillQualsFromLikelihoodMap(final int refLoc, + final PerReadAlleleLikelihoodMap likelihoodMap) { + final ArrayList tumorFwdOffset = new ArrayList<>(); + final ArrayList tumorRevOffset = new ArrayList<>(); + for ( final Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) { + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if ( ! a.isInformative() ) + continue; // read is non-informative + + final GATKSAMRecord read = el.getKey(); + if ( isUsableRead(read, refLoc) ) { + if ( a.getMostLikelyAllele().isReference() ) + continue; + final Double valueRight = getElementForRead(read, refLoc, ReadUtils.ClippingTail.RIGHT_TAIL); + if ( valueRight == null ) + continue; + tumorFwdOffset.add(valueRight); + final Double valueLeft = getElementForRead(read, refLoc, ReadUtils.ClippingTail.LEFT_TAIL); + if ( valueLeft == null ) + continue; + tumorRevOffset.add(valueLeft); + } + } + + double fwdMedian = 0.0; + double revMedian = 0.0; + double fwdMAD = 0.0; + double revMAD = 0.0; + + if (!tumorFwdOffset.isEmpty() && !tumorRevOffset.isEmpty()) { + fwdMedian = MuTectStats.getMedian(tumorFwdOffset); + revMedian = MuTectStats.getMedian(tumorRevOffset); + fwdMAD = MuTectStats.calculateMAD(tumorFwdOffset, fwdMedian); + revMAD = MuTectStats.calculateMAD(tumorRevOffset, revMedian); + } + + return( new double[] {fwdMedian, revMedian, fwdMAD, revMAD} ); // TODO: make an object container instead of array + } + + protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, 
final ReadUtils.ClippingTail tail) { + final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refLoc, tail, true); + if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) // offset is the number of bases in the read, including inserted bases, from start of read to the variant + return null; + + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0); // readpos is the number of REF bases from start to variant. I would name it as such... + final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); + if (readPos > numAlignedBases / 2) + readPos = numAlignedBases - (readPos + 1); + return (double)readPos; + } + + /** + * Can the read be used in comparative tests between ref / alt bases? + * + * @param read the read to consider + * @param refLoc the reference location + * @return true if this read is meaningful for comparison, false otherwise + */ + protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) { + return !( read.getMappingQuality() == 0 || + read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/Dream_Evaluations.md b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/Dream_Evaluations.md index dd86aed24..69aae6872 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/Dream_Evaluations.md +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/Dream_Evaluations.md @@ -54,7 +54,7 @@ and then run the following Queue command java \ -Djava.io.tmpdir=$TEMPDIR \ -jar $QUEUE_JAR \ - -S $GSA_UNSTABLE_HOME/private/gatk-tools-private/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_dream.scala \ + -S 
$GSA_UNSTABLE_HOME/private/gatk-queue-extensions-internal/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/m2/run_M2_dream.scala \ --job_queue gsa -qsub -jobResReq virtual_free=5G -startFromScratch \ -sc 200 \ -normal $NORMAL_BAM \ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java index b3c4d1f75..339603567 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java @@ -135,6 +135,9 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection @Argument(fullName = "strand_artifact_power_threshold", required = false, doc = "power threshold for calling strand bias") public float STRAND_ARTIFACT_POWER_THRESHOLD = 0.9f; + @Argument(fullName = "enable_strand_artifact_filter", required = false, doc = "turn on strand artifact filter") + public boolean ENABLE_STRAND_ARTIFACT_FILTER = false; + /** * This argument is used for the M1-style read position filter */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2_HapMapSensitivity.md b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2_HapMapSensitivity.md new file mode 100644 index 000000000..fe4d30cde --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2_HapMapSensitivity.md @@ -0,0 +1,68 @@ +# CRSP HapMap Sensitivity Evaluation + +### Current M2 Performance +(gsa-unstable 9/1/15, commit:a08903d) + +| Mixture | type | sensitivity | +|------|------|------| +| 5-plex |SNP|0.9691274| +| 5-plex |INDEL|0.87466127| +| 10-plex
|SNP|0.97179496| +| 10-plex |INDEL|0.8888889| +| 20-plex |SNP|0.9537307| +| 20-plex |INDEL|0.83281654| + + +### Run Procedure +Run the script separately for each HapMap mixture bam: + +inputDir=/dsde/working/mutect/laura/hapmapSensitivity/inputs/ +Queue_Jar= + +``` +java -jar $Queue_Jar -S Qscript_HapMapPlex.scala \ +-intervals $inputDir/agilent_5plex_intervalFiles.list \ +-tumors $inputDir/agilent_5plex_bams.list \ +-truthVCF $inputDir/agilent_5plex_truth_intervals.vcf \ +-snpCounts $inputDir/agilent_5plex_truth_intervals.snpCounts.list \ +-indelCounts $inputDir/agilent_5plex_truth_intervals.indelCounts.list \ +-o \ +-qsub -jobQueue gsa -jobResReq virtual_free=5G -sc 50 +``` + +The HapMap bams get run as tumors without normals because we're not interested in specificity here, so we don't need the normals to filter out noise + +### Inputs +Bam lists: +5- and 10-plex have 3 replicates, 20-plex has 9 + +Interval files: +If we're only interested in sensitivity, then we only need to run the caller around known true positive sites, which we take from the truth VCFs +This workaround repeats the truth filename for the number of bams -- in theory each could have a separate truth VCF, but they are the same titration mixture + +SNP/INDEL counts: +This is the number of events in the truth VCFs so we can find the sensitivity across all samples +TODO: this could be generalized + +### Outputs +Each run outputs its own SNP and INDEL sensitivity combined across all samples: +``` +Sensitivity across all samples: +SNPs: 0.95156 +INDELs: 0.7328859 +``` + +Note that these are not filtered for depth as described in the CRSP documentation + +### Resources +Truth file preparation for 5-plex: +Start with /cga/tcga-gsc/benchmark/data/crsp-truth/1kg_5plex_wgs_hc_calls.codingIndelSnp.db135.recode.vcf +Select out allele fraction greater than 20% using "vc.isBiallelic() ?
AF >= 0.2 : vc.hasGenotypes() && vc.getCalledChrCount(vc.getAltAlleleWithHighestAlleleCount())/(1.0*vc.getCalledChrCount()) >= 0.2" + +Similar for 10-plex source: +/cga/tcga-gsc/benchmark/data/crsp-truth/1kg_10plex_wgs_hc_calls.codingIndelSnp.db135.recode.vcf + +And 20-plex source: +/cga/tcga-gsc/benchmark/data/crsp-truth/1kg_20plex_wgs_hc_calls.codingIndelSnp.db135.recode.vcf + +both also using AF filter of 0.2 \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java index a81827b7a..08ea5c243 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java @@ -399,6 +399,13 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY)); headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY)); + if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER){ + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TLOD_FWD_KEY)); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TLOD_REV_KEY)); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY)); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY)); + } + headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ALLELE_FRACTION_KEY)); headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME)); @@ -410,11 +417,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.TUMOR_LOD_FILTER_NAME)); 
headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME)); headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME)); + headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME)); - headerInfo.add(new VCFFilterHeaderLine("M1_CLUSTERED_READ_POSITION", "Variant appears in similar read positions")); - headerInfo.add(new VCFFilterHeaderLine("M1_STRAND_BIAS", "Forward LOD vs. reverse LOD comparison indicates strand bias")); - headerInfo.add(new VCFInfoHeaderLine("TLOD_FWD",1,VCFHeaderLineType.Integer,"TLOD from forward reads only")); - headerInfo.add(new VCFInfoHeaderLine("TLOD_REV",1,VCFHeaderLineType.Integer,"TLOD from reverse reads only")); if ( ! doNotRunPhysicalPhasing ) { headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_ID_KEY)); @@ -728,7 +732,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME); } - // if we are in artifact detection mode, apply the thresholds for the LOD scores + // if we are not in artifact detection mode, apply the thresholds for the LOD scores if (!MTAC.ARTIFACT_DETECTION_MODE) { filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes)); } @@ -754,11 +758,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i annotatedCalls.add(vcb.make()); } - - - - - + // TODO: find a better place for this debug message + // logger.info("We had " + TumorPowerCalculator.numCacheHits + " hits in strand artifact power calculation"); return annotatedCalls; } @@ -834,16 +835,10 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME); } - Integer tumorFwdPosMedian = (Integer) vc.getAttribute("TUMOR_FWD_POS_MEDIAN"); - Integer tumorRevPosMedian = (Integer) vc.getAttribute("TUMOR_REV_POS_MEDIAN"); - Integer tumorFwdPosMAD = (Integer)
vc.getAttribute("TUMOR_FWD_POS_MAD"); - Integer tumorRevPosMAD = (Integer) vc.getAttribute("TUMOR_REV_POS_MAD"); - //If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter - if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) || - (tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD)) - filters.add("M1_CLUSTERED_READ_POSITION"); - + // TODO: Add clustered read position filter here + // TODO: Move strand bias filter here return filters; + } @@ -1064,9 +1059,9 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i @Advanced @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) // protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); - //protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", - // "RMSMappingQuality","MappingQualityRankSumTest","FisherStrand","StrandOddsRatio","ReadPosRankSumTest","QualByDepth", "Coverage"})); - protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts", "ClusteredEventsAnnotator"})); +// protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", +// "RMSMappingQuality","MappingQualityRankSumTest","FisherStrand","StrandOddsRatio","ReadPosRankSumTest","QualByDepth", "Coverage"})); + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample",
"BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"})); /** * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_ICE_NN.scala b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTectStats.java similarity index 82% rename from protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_ICE_NN.scala rename to protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTectStats.java index f4959e296..bb02db4ad 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_ICE_NN.scala +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTectStats.java @@ -49,54 +49,60 @@ * 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.gatk.queue.qscripts.dev +package org.broadinstitute.gatk.tools.walkers.cancer.m2; -import org.broadinstitute.gatk.queue.QScript -import org.broadinstitute.gatk.queue.extensions.gatk._ -import org.broadinstitute.gatk.queue.util.QScriptUtils +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; -class run_M2_ICE_NN extends QScript { +/** + * Collection of Statistical methods and tests used by MuTect + */ +public class MuTectStats { - @Argument(shortName = "bams", required = true, doc = "file of all BAM files") - var allBams: String = "" + public static double calculateMAD(ArrayList xs, double median) { + ArrayList deviations = new ArrayList<>(xs.size()); - @Argument(shortName = "o", required = false, doc = "Output prefix") - var outputPrefix: String = "" + for(double x : xs) { + deviations.add(Math.abs(x - median)); + } - @Argument(shortName = "pon", required = false, doc = "Normal PON") - var panelOfNormals: String = "/dsde/working/mutect/panel_of_normals/panel_of_normals_m2_ice_wgs_territory/m2_406_ice_normals_wgs_calling_regions.vcf"; + return getMedian(deviations); - @Argument(shortName = "sc", required = false, doc = "base scatter count") - var scatter: Int = 10 - - - def script() { - val bams = QScriptUtils.createSeqFromFile(allBams) - - for (tumor <- bams) { - for (normal <- bams) { - if (tumor != normal) add( createM2Config(tumor, normal, new File(panelOfNormals), outputPrefix)) - } } - } + public static double getMedian(ArrayList data) { + Collections.sort(data); + Double result; - def createM2Config(tumorBAM : File, normalBAM : File, panelOfNormals : File, outputPrefix : String): M2 = { - val mutect2 = new MuTect2 + if (data.size() % 2 == 1) { + // If the number of entries in the list is not even. 
- mutect2.reference_sequence = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - mutect2.cosmic :+= new File("/xchip/cga/reference/hg19/hg19_cosmic_v54_120711.vcf") - mutect2.dbsnp = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_138.b37.vcf") - mutect2.normal_panel :+= panelOfNormals + // Get the middle value. + // You must floor the result of the division to drop the + // remainder. + result = data.get((int) Math.floor(data.size()/2) ); - mutect2.intervalsString :+= new File("/dsde/working/mutect/crsp_nn/whole_exome_illumina_coding_v1.Homo_sapiens_assembly19.targets.no_empty.interval_list") - mutect2.memoryLimit = 2 - mutect2.input_file = List(new TaggedFile(normalBAM, "normal"), new TaggedFile(tumorBAM, "tumor")) + } else { + // If the number of entries in the list are even. - mutect2.scatterCount = scatter - mutect2.out = outputPrefix + tumorBAM.getName + "-vs-" + normalBAM.getName + ".vcf" + // Get the middle two values and average them. 
+ Double lowerMiddle = data.get(data.size()/2 ); + Double upperMiddle = data.get(data.size()/2 - 1 ); + result = (lowerMiddle + upperMiddle) / 2; + } - println("Adding " + tumorBAM + " vs " + normalBAM + " as " + mutect2.out) - mutect2 - } + return result; + } + + public static double[] convertIntegersToDoubles(List integers) + { + double[] ret = new double[integers.size()]; + for (int i=0; i < ret.length; i++) + { + ret[i] = integers.get(i); + } + return ret; + } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/NA12878_Evaluations.md b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/NA12878_Evaluations.md index b7d0c58f2..e5ed5572c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/NA12878_Evaluations.md +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/NA12878_Evaluations.md @@ -22,16 +22,18 @@ TODO: write a simple tool to do this more easily To calculate per pair-counts, run: ``` # for SNPs -for vcf in *.bam.vcf -do - cat $vcf | grep PASS | awk '{ if ( length($4) + length($5) == 2) print $0 }' | wc -l -done +for vcf in *.vcf +do + cat $vcf | grep PASS | awk '{ if ( length($4) + length($5) == 2) print $0 }' | wc -l +done > snp-fps.txt +cat snp-fps.txt | awk '{ sum += $1 } END { print sum }' # for INDELs -for vcf in *.bam.vcf -do - cat $vcf | grep PASS | awk '{ if ( length($4) + length($5) != 2) print $0 }' | wc -l -done +for vcf in *.vcf +do + cat $vcf | grep PASS | awk '{ if ( length($4) + length($5) != 2) print $0 }' | wc -l +done > indel-fps.txt +cat indel-fps.txt | awk '{ sum += $1 } END { print sum }' ``` ### Current M1 and Indelocator Performance @@ -72,7 +74,7 @@ and then run the following Queue command java \ -Djava.io.tmpdir=$TEMPDIR \ -jar $QUEUE_JAR \ - -S 
$GSA_UNSTABLE_HOME/private/gatk-tools-private/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_ICE_NN.scala \ + -S $GSA_UNSTABLE_HOME/private/gatk-queue-extensions-internal/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/m2/run_M2_ICE_NN.scala \ -sc 50 \ --job_queue gsa -qsub -jobResReq virtual_free=5G -startFromScratch \ --allbams /humgen/gsa-hpprojects/NA12878Collection/bams/crsp_ice_validation//NA12878.intra.flowcell.replicate.bam_list \ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/AbstractPowerCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/PerAlleleCollection.java similarity index 61% rename from protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/AbstractPowerCalculator.java rename to protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/PerAlleleCollection.java index d9743dc33..d5ac3094d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/AbstractPowerCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/PerAlleleCollection.java @@ -5,7 +5,7 @@ * SOFTWARE LICENSE AGREEMENT * FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY * -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE"). 
* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. @@ -21,11 +21,11 @@ * 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. * * 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. 
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. * * 4. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. +* Copyright 2012-2016 Broad Institute, Inc. * Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. 
* @@ -51,50 +51,145 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; -import java.util.HashMap; +import htsjdk.variant.variantcontext.Allele; -public class AbstractPowerCalculator { - protected HashMap cache = new HashMap(); - protected double constantEps; - protected double constantLodThreshold; +import java.util.*; - protected static class PowerCacheKey { - private int n; - private double delta; +/** + * A container for allele to value mapping. + * + * Each PerAlleleCollection may hold a value for each ALT allele and, optionally, a value for the REF allele. + * For example, + * + * PerAlleleCollection alleleFractions = PerAlleleCollection.createPerAltAlleleCollection() + * + * may be a container for allele fractions for ALT alleles in a variant context. While + * + * PerAlleleCollection alleleCount = PerAlleleCollection.createPerRefAndAltAlleleCollection() + * + * may hold the allele counts for the REF allele and all ALT alleles in a variant context. + * + * + **/ +public class PerAlleleCollection { + // TODO: consider using Optional for ref allele + private Optional refAllele; + private Optional refValue; + private Map altAlleleValueMap; + private boolean altOnly; - public PowerCacheKey(int n, double delta) { - this.n = n; - this.delta = delta; + private PerAlleleCollection(boolean altOnly){ + this.altOnly = altOnly; + this.altAlleleValueMap = new HashMap(); + this.refAllele = Optional.empty(); + + } + + public static PerAlleleCollection createPerAltAlleleCollection(){ + return new PerAlleleCollection(true); + } + + public static PerAlleleCollection createPerRefAndAltAlleleCollection(){ + return new PerAlleleCollection(false); + } + + /** + * Take an allele, REF or ALT, and update its value appropriately + * + * @param allele : REF or ALT allele + * @param newValue : + */ + public void set(Allele allele, X newValue){ + if (allele == null || newValue == null){ + throw new IllegalArgumentException("allele or newValue is null"); } - - @Override - public 
boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - PowerCacheKey that = (PowerCacheKey) o; - - if (Double.compare(that.delta, delta) != 0) return false; - if (n != that.n) return false; - - return true; + if (allele.isReference() && altOnly){ + throw new IllegalArgumentException("Collection stores values for alternate alleles only"); } - - @Override - public int hashCode() { - int result; - long temp; - result = n; - temp = delta != +0.0d ? Double.doubleToLongBits(delta) : 0L; - result = 31 * result + (int) (temp ^ (temp >>> 32)); - return result; + if (allele.isReference()){ + this.setRef(allele, newValue); + } else { + this.setAlt(allele, newValue); } } - protected static double calculateLogLikelihood(int depth, int alts, double eps, double f) { - double a = (depth-alts) * Math.log10(f*eps + (1d-f)*(1d-eps)); - double b = (alts) * Math.log10(f*(1d-eps) + (1d-f)*eps); - return (a+b); + public void setRef(Allele refAllele, X newValue){ + if (refAllele == null || newValue == null){ + throw new IllegalArgumentException("refAllele or newValue is null"); + } + if (refAllele.isNonReference()){ + throw new IllegalArgumentException("Setting Non-reference allele as reference"); + } + + if (this.refAllele.isPresent()){ + throw new IllegalArgumentException("Resetting the reference allele not permitted"); + } + + this.refAllele = Optional.of(refAllele); + this.refValue = Optional.of(newValue); } -} + public void setAlt(Allele altAllele, X newValue){ + if (altAllele == null || newValue == null){ + throw new IllegalArgumentException("altAllele or newValue is null"); + } + if (altAllele.isReference()){ + throw new IllegalArgumentException("Setting reference allele as alt"); + } + + altAlleleValueMap.put(altAllele, newValue); + } + + /** + * Get the value for an allele, REF or ALT + * @param allele + */ + public X get(Allele allele){ + if (allele == null){ + throw new IllegalArgumentException("allele is 
null"); + } + + if (allele.isReference()){ + if (allele.equals(this.refAllele.get())){ + return(getRef()); + } else { + throw new IllegalArgumentException("Requested ref allele does not match the stored ref allele"); + } + } else { + return(getAlt(allele)); + } + } + + public X getRef(){ + if (altOnly) { + throw new IllegalStateException("Collection does not hold the REF allele"); + } + + if (this.refAllele.isPresent()){ + return(refValue.get()); + } else { + throw new IllegalStateException("Collection's ref allele has not been set yet"); + } + } + + public X getAlt(Allele allele){ + if (allele == null){ + throw new IllegalArgumentException("allele is null"); + } + if (allele.isReference()){ + throw new IllegalArgumentException("allele is not an alt allele"); + } + + if (altAlleleValueMap.containsKey(allele)) { + return(altAlleleValueMap.get(allele)); + } else { + throw new IllegalArgumentException("Requested alt allele is not in the collection"); + } + + } + + + public Set getAltAlleles(){ + return(altAlleleValueMap.keySet()); + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java index 4c918823b..536283bc3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java @@ -54,8 +54,9 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; import com.google.java.contract.Ensures; import htsjdk.samtools.util.StringUtil; import htsjdk.variant.variantcontext.*; +import org.apache.commons.lang.mutable.MutableDouble; +import org.apache.commons.lang.mutable.MutableInt; import org.apache.log4j.Logger; -import 
org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -77,8 +78,10 @@ import java.util.*; public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { - protected M2ArgumentCollection MTAC; - private TumorPowerCalculator strandArtifactPowerCalculator; + protected final M2ArgumentCollection MTAC; + private final TumorPowerCalculator strandArtifactPowerCalculator; + final boolean REF_AND_ALT = false; + final boolean ALT_ONLY = true; private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class); @@ -86,8 +89,8 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { super(configuration, samples, genomeLocParser, afCalculatorProvider, doPhysicalPhasing); this.MTAC = MTAC; // coverage related initialization - double powerConstantEps = Math.pow(10, -1 * (MTAC.POWER_CONSTANT_QSCORE/10)); - strandArtifactPowerCalculator = new TumorPowerCalculator(powerConstantEps, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); + final double errorProbability = Math.pow(10, -(MTAC.POWER_CONSTANT_QSCORE/10)); + strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); } /** @@ -95,7 +98,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling * * The list of samples we're working with is obtained from the readLikelihoods - * + * @param haplotypes Haplotypes to assign likelihoods to * @param readLikelihoods Map from reads->(haplotypes,likelihoods) * @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods. 
@@ -112,7 +115,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { // @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) @Ensures("result != null") // TODO - can this be refactored? this is hard to follow! - public HaplotypeCallerGenotypingEngine.CalledHaplotypes callMutations ( + public CalledHaplotypes callMutations ( final List haplotypes, //final Map haplotypeReadMap, final ReadLikelihoods readLikelihoods, @@ -145,7 +148,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { // Somatic Tumor/Normal Sample Handling verifySamplePresence(tumorSampleName, readLikelihoods.samples()); - final boolean hasNormal = (matchedNormalSampleName != null); + final boolean hasNormal = matchedNormalSampleName != null; // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference // that carry events among the haplotypes @@ -175,13 +178,6 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { if( mergedVC == null ) { continue; } - final int numAlts = mergedVC.getNAlleles()-1; - -// final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC); - - final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP() - ? 
GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL; - if (emitReferenceConfidence) mergedVC = addNonRefSymbolicAllele(mergedVC); @@ -216,173 +212,183 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present after filtering for overlapping reads"); // extend to multiple samples - //handle existence of secondary alts - double[] afs = estimateAlleleFraction(mergedVC, tumorPRALM); + // compute tumor LOD for each alternate allele + // TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. getHetGenotypeLogLikelihoods will not check that for you + final PerAlleleCollection altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false); + final PerAlleleCollection tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions); if( configuration.DEBUG && logger != null ) { - String output = "Calculated allelic fraction at " + loc + " = "; - for (int i = 0; i tumorLods = PerAlleleCollection.createPerAltAlleleCollection(); + for (Allele altAllele : mergedVC.getAlternateAlleles()){ + tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef()); + } - PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap(); - PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap(); - splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM); - - // TODO: TS uncomment and fix -// double f_fwd = estimateAlleleFraction(mergedVC, forwardPRALM); -// double[] tumorGLs_fwd = getVariableGenotypeLikelihoods(mergedVC, forwardPRALM, f_fwd); -// -// double f_rev = estimateAlleleFraction(mergedVC, reversePRALM); -// double[] tumorGLs_rev = getVariableGenotypeLikelihoods(mergedVC, reversePRALM, f_rev); + if 
(configuration.DEBUG && logger != null) { + StringBuilder outputSB = new StringBuilder("Tumor LOD at " + loc + " = ["); + for (Allele altAllele : tumorLods.getAltAlleles()) { + outputSB.append( altAllele + ": " + tumorLods.getAlt(altAllele) + ", "); + } + outputSB.append("]"); + logger.info(outputSB.toString()); + } + double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; + double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; PerReadAlleleLikelihoodMap normalPRALM = null; - double[] normalGLs = null; + PerAlleleCollection normalLods = PerAlleleCollection.createPerAltAlleleCollection(); + + // if normal bam is available, compute normal LOD if (hasNormal) { normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true); MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after filtering for overlapping reads"); - double[] diploidAFarray = new double[numAlts]; - Arrays.fill(diploidAFarray, 0.5d); - normalGLs = getVariableGenotypeLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidAFarray); - } - - double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; - double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; - - final int REF_INDEX = 0; - double[] tumorLods = new double[numAlts]; - for (int altInd = 0; altInd < numAlts; altInd++) { - tumorLods[altInd] = tumorGLs[altInd+1] - tumorGLs[REF_INDEX]; - } - if (configuration.DEBUG && logger != null) { - String output = "Tumor LOD at " + loc + " = "; - for (int i = 0; i cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc); Collection dbsnpVC = tracker.getValues(dbsnpRod, eventGenomeLoc); // remove the effect of cosmic from dbSNP - boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty()); + final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty()); INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; 
//only set this if this job has a normal - NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD; - for (int altInd = 0; altInd < numAlts; altInd++) - normalLods[altInd] = normalGLs[REF_INDEX] - normalGLs[altInd+1]; + NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD; + + + // compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5 + // note normal LOD is REF over ALT, the reciprocal of the tumor LOD + final PerAlleleCollection diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection(); + for (final Allele allele : mergedVC.getAlternateAlleles()){ + diploidHetAlleleFractions.setAlt(allele, 0.5); + } + + final PerAlleleCollection normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions); + + for (final Allele altAllele : mergedVC.getAlternateAlleles()){ + normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele)); + } + } - //reconcile multiple alts, if applicable int numPassingAlts = 0; - int lodInd = 0; - for (int altInd = 0; altInd < numAlts; altInd++) { - if (tumorLods[altInd] >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD && normalLods[altInd] >= INIT_NORMAL_LOD_THRESHOLD) { + Set allelesThatPassThreshold = new HashSet<>(); + Allele alleleWithHighestTumorLOD = null; + + // TODO: use lambda + for (Allele altAllele : mergedVC.getAlternateAlleles()) { + final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD; + final boolean passesNormalLodThreshold = hasNormal ? 
normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true; + if (passesTumorLodThreshold && passesNormalLodThreshold) { numPassingAlts++; - lodInd = altInd; + allelesThatPassThreshold.add(altAllele); + if (alleleWithHighestTumorLOD == null + || tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){ + alleleWithHighestTumorLOD = altAllele; + } } } - // TS: if more than one passing alt alleles, filter it out, so doesn't matter which one we pick - - final double tumorLod = tumorLods[lodInd]; - final double normalLod = normalLods[lodInd]; - - double tumorSBpower_fwd; - double tumorSBpower_rev; - - // TODO: TS fix -// try { -// tumorSBpower_fwd = strandArtifactPowerCalculator.cachingPowerCalculation(forwardPRALM.getNumberOfStoredElements(), f_fwd); -// tumorSBpower_rev = strandArtifactPowerCalculator.cachingPowerCalculation(reversePRALM.getNumberOfStoredElements(), f_rev); -// } -// catch (Throwable t) { -// System.err.println("Error processing " + activeRegionWindow.getContig() + ":" + loc); -// t.printStackTrace(System.err); -// -// throw new RuntimeException(t); -// } - + final boolean emitVariant = numPassingAlts > 0; VariantContext call = null; - if (tumorLod >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD && normalLod >= INIT_NORMAL_LOD_THRESHOLD) { + if (emitVariant) { VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC); - - if (normalLod < NORMAL_LOD_THRESHOLD) { - callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME); - } - - int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(lodInd)).size(); - // TODO: TS revisit -// callVcb.attribute("TLOD_FWD",tumorLod_fwd); -// callVcb.attribute("TLOD_REV",tumorLod_rev); -// if ( (tumorSBpower_fwd >= MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || -// (tumorSBpower_rev >= MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ) -// callVcb.filter("M1_STRAND_BIAS"); - - // TODO: TS revisit -// 
if ( (tumorSBpower_fwd >= MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || -// (tumorSBpower_rev >= MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ) -// callVcb.filter("M1_STRAND_BIAS"); -// -// // FIXME: can simply get first alternate since above we only deal with Bi-allelic sites... -// int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size(); + // FIXME: can simply get first alternate since above we only deal with Bi-allelic sites... + int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size(); callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); - callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLod); - callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLod); + callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD)); - if (normalLod < NORMAL_LOD_THRESHOLD) { - callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME); + if (hasNormal) { + callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD)); + if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) { + callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME); + } } + // M1-style strand artifact filter + // TODO: move code to MuTect2::calculateFilters() + // skip if VC has multiple alleles - it will get filtered later anyway + if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) { + final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap(); + final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap(); + splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM); + + // TODO: build a new type for probability, likelihood, and log_likelihood. e.g. 
f_fwd :: probability[], tumorGLs_fwd :: likelihood[] + // TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd. + final PerAlleleCollection alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true); + final PerAlleleCollection tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward); + + final PerAlleleCollection alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true); + final PerAlleleCollection tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse); + + double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef(); + double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef(); + + double tumorSBpower_fwd = 0.0; + double tumorSBpower_rev = 0.0; + try { + // Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction + tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); + tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); + } + catch (Throwable t) { + System.err.println("Error processing " + activeRegionWindow.getContig() + ":" + loc); + t.printStackTrace(System.err); + throw new RuntimeException(t); + } + + callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd); + callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev); + callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd); + callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev); + // TODO: add vcf INFO 
fields. see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); + + if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || + (tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD)) + callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME); + } + + // TODO: this probably belongs in M2::calculateFilters() if (numPassingAlts > 1) { callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME); } + // build genotypes TODO: this part needs review and refactor List tumorAlleles = new ArrayList<>(); tumorAlleles.add(mergedVC.getReference()); - tumorAlleles.add(mergedVC.getAlternateAllele(lodInd)); - GenotypeBuilder tumorGenotype = - new GenotypeBuilder(tumorSampleName, tumorAlleles); - - tumorGenotype.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, afs[lodInd]); - - // how should we set the genotype properly here? - List refAlleles = new ArrayList<>(); - refAlleles.add(mergedVC.getReference()); - refAlleles.add(mergedVC.getReference()); - - + tumorAlleles.add(alleleWithHighestTumorLOD); + Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles) + .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD)) + .make(); // TODO: add ADs? List genotypes = new ArrayList<>(); - genotypes.add(tumorGenotype.make()); + genotypes.add(tumorGenotype); - // if we are calling with a normal, add that sample in + // We assume that the genotype in the normal is 0/0 + // TODO: is normal always homozygous reference? 
+ List homRefAllelesforNormalGenotype = new ArrayList<>(); + homRefAllelesforNormalGenotype.addAll(Collections.nCopies(2, mergedVC.getReference())); + + // if we are calling with a normal, build the genotype for the sample to appear in vcf + int REF = 0, ALT = 1; if (hasNormal) { - int[] normalCounts = getRefAltCount(mergedVC, normalPRALM); - int[] normalAD = new int[2]; - normalAD[REF_INDEX] = normalCounts[REF_INDEX]; - normalAD[1] = normalCounts[lodInd+1]; - double normalF = (double) normalAD[1] / ((double) normalAD[REF_INDEX] + (double) normalAD[1]); + PerAlleleCollection normalCounts = getRefAltCount(mergedVC, normalPRALM, false); + final int normalRefAlleleDepth = normalCounts.getRef(); + final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD); + final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth }; + final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth); - GenotypeBuilder normalGenotype = - new GenotypeBuilder(matchedNormalSampleName, refAlleles).AD(normalAD); - normalGenotype.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalF); - genotypes.add(normalGenotype.make()); + final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype) + .AD(normalAlleleDepths) + .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction) + .make(); + genotypes.add(normalGenotype); } //only use alleles found in the tumor ( @@ -426,91 +432,125 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } } - /** Calculate the genotype likelihoods for variable allele fraction + /** Calculate the likelihoods of hom ref and each het genotype of the form ref/alt * * @param mergedVC input VC * @param tumorPRALM read likelihoods * @param originalNormalMQs original MQs, before boosting normals to avoid qual capping - * @param afs allele fraction(s) for alternate allele(s) + * @param 
alleleFractions allele fraction(s) for alternate allele(s) * - * @return genotype likelihoods for homRef (index 0) and het for each alternate allele + * @return genotype likelihoods for homRef and het for each alternate allele */ - private double[] getVariableGenotypeLikelihoods(final VariantContext mergedVC, final PerReadAlleleLikelihoodMap tumorPRALM, - final Map originalNormalMQs, double[] afs) { - double[] genotypeLikelihoods = new double[mergedVC.getNAlleles()]; - for(Map.Entry> e : tumorPRALM.getLikelihoodReadMap().entrySet()) { - Map m = e.getValue(); - Double refLL = m.get(mergedVC.getReference()); - if (originalNormalMQs.get(e.getKey().getReadName()) != 0) { - genotypeLikelihoods[0] += Math.log10(Math.pow(10, refLL)); - - for (int altInd = 0; altInd < mergedVC.getNAlleles()-1; altInd++) { - Double altLL = m.get(mergedVC.getAlternateAllele(altInd)); - genotypeLikelihoods[altInd+1] += Math.log10(Math.pow(10, refLL) * (1 - afs[altInd]) + Math.pow(10, altLL) * afs[altInd]); - } - } + private PerAlleleCollection getHetGenotypeLogLikelihoods(final VariantContext mergedVC, + final PerReadAlleleLikelihoodMap tumorPRALM, + final Map originalNormalMQs, + final PerAlleleCollection alleleFractions) { + // make sure that alleles in alleleFraction are a subset of alleles in the variant context + if (! 
mergedVC.getAlternateAlleles().containsAll(alleleFractions.getAltAlleles()) ){ + throw new IllegalArgumentException("alleleFractions has alleles that are not in the variant context"); } - return genotypeLikelihoods; + + final PerAlleleCollection genotypeLogLikelihoods = PerAlleleCollection.createPerRefAndAltAlleleCollection(); + for (final Allele allele : mergedVC.getAlleles()){ + genotypeLogLikelihoods.set(allele, new MutableDouble(0.0)); + } + + final Allele refAllele = mergedVC.getReference(); + for(Map.Entry> readAlleleLikelihoodMap : tumorPRALM.getLikelihoodReadMap().entrySet()) { + Map alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); + if (originalNormalMQs.get(readAlleleLikelihoodMap.getKey().getReadName()) == 0) { + continue; + } + + final double readRefLogLikelihood = alleleLikelihoodMap.get(refAllele); + genotypeLogLikelihoods.getRef().add(readRefLogLikelihood); + + for (Allele altAllele : alleleFractions.getAltAlleles()) { + double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele); + double adjustedReadAltLL = Math.log10( + Math.pow(10, readRefLogLikelihood) * (1 - alleleFractions.getAlt(altAllele)) + + Math.pow(10, readAltLogLikelihood) * alleleFractions.getAlt(altAllele) + ); + genotypeLogLikelihoods.get(altAllele).add(adjustedReadAltLL); + } + + } + + final PerAlleleCollection result = PerAlleleCollection.createPerRefAndAltAlleleCollection(); + mergedVC.getAlleles().stream().forEach(a -> result.set(a,genotypeLogLikelihoods.get(a).toDouble())); + + return result; } /** * Find the allele fractions for each alternate allele * * @param vc input VC, for alleles - * @param map read likelihoods + * @param pralm read likelihoods * @return estimated AF for each alt */ // FIXME: calculate using the uncertainty rather than this cheap approach - private double[] estimateAlleleFraction(VariantContext vc, PerReadAlleleLikelihoodMap map) { - int[] counts = getRefAltCount(vc, map); - int numAlts = vc.getNAlleles()-1; - double[] afs = new 
double[numAlts]; - int refCount = counts[0]; - int altCount; + private PerAlleleCollection estimateAlleleFraction(final VariantContext vc, + final PerReadAlleleLikelihoodMap pralm, + final boolean oneStrandOnly) { + final PerAlleleCollection alleleCounts = getRefAltCount(vc, pralm, oneStrandOnly); + final PerAlleleCollection alleleFractions = PerAlleleCollection.createPerAltAlleleCollection(); - for(int altInd = 0; altInd < numAlts; altInd++) { - altCount = counts[altInd+1]; - afs[altInd] = (double) altCount / ((double) refCount + (double) altCount); - //logger.info("Counted " + refCount + " ref and " + altCount + " alt " ); + int refCount = alleleCounts.getRef(); + for ( final Allele altAllele : vc.getAlternateAlleles() ) { + int altCount = alleleCounts.getAlt(altAllele); + double alleleFraction = (double) altCount / (refCount + altCount); + alleleFractions.setAlt(altAllele, alleleFraction); + // logger.info("Counted " + refCount + " ref and " + altCount + " alt " ); } - return afs; + return alleleFractions; } /** - * Evalutate the most likely allele for each read, if it is in fact informative + * Go through the PRALM and tally the most likely allele in each read. Only count informative reads. * - * @param mergedVC input VC, for alleles - * @param afMap read likelihoods + * @param vc input VC, for alleles + * @param pralm read likelihoods * @return an array giving the read counts for the ref and each alt allele */ - // TODO: ensure there are only two alleles in the VC - private int[] getRefAltCount(VariantContext mergedVC, PerReadAlleleLikelihoodMap afMap) { - int counts[] = new int[mergedVC.getNAlleles()]; - int REF = 0; + private PerAlleleCollection getRefAltCount(final VariantContext vc, + final PerReadAlleleLikelihoodMap pralm, + final boolean oneStrandOnly) { + // Check that the alleles in Variant Context are in PRALM + // Skip the check for strand-conscious PRALM; + reads may not have alleles in - reads, for example. 
+ final Set vcAlleles = new HashSet<>(vc.getAlleles()); + if ( ! oneStrandOnly && ! pralm.getAllelesSet().containsAll( vcAlleles ) ) { + StringBuilder message = new StringBuilder(); + message.append("At Locus chr" + vc.getContig() + ":" + vc.getStart() + ", we detected that variant context had alleles that not in PRALM. "); + message.append("VC alleles = " + vcAlleles + ", PRALM alleles = " + pralm.getAllelesSet()); + logger.warn(message); + } - for(Map.Entry> e : afMap.getLikelihoodReadMap().entrySet()) { - Map m = e.getValue(); - Double rl = m.get(mergedVC.getReference()); - for(int altInd=0; altInd alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection(); + + // initialize the allele counts to 0 + for (final Allele allele : vcAlleles) { + alleleCounts.set(allele, new MutableInt(0)); + } + + for (final Map.Entry> readAlleleLikelihoodMap : pralm.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = readAlleleLikelihoodMap.getKey(); + final Map alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); + MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles); + + if (read.getMappingQuality() > 0 && mostLikelyAllele.isInformative()) { + alleleCounts.get(mostLikelyAllele.getMostLikelyAllele()).increment(); } -// if (al >= rl) logger.info("Alt found in " + e.getKey().getReadName()); } - return counts; - } + final PerAlleleCollection result = PerAlleleCollection.createPerRefAndAltAlleleCollection(); + vc.getAlleles().stream().forEach(a -> result.set(a, alleleCounts.get(a).toInteger())); + + return(result); + } private void logM2Debug(String s) { if (MTAC.M2_DEBUG) { @@ -518,14 +558,6 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } } - // would have used org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getMostLikelyAllele but we have this case where - // there is a read that doesn't overlap the variant site, and thus supports both 
alleles equally. - private boolean arePairHMMLikelihoodsInformative(double l1, double l2) { - // TODO: should this be parameterized, or simply encoded - double EPSILON = 0.1; - return (Math.abs(l1 - l2) >= EPSILON); - } - private void filterPRALMForOverlappingReads(PerReadAlleleLikelihoodMap pralm, Allele ref, int location, boolean retainMismatches) { Map> m = pralm.getLikelihoodReadMap(); @@ -598,36 +630,21 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } } - private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap original, final PerReadAlleleLikelihoodMap forward, final PerReadAlleleLikelihoodMap reverse) { - Map> origMap = original.getLikelihoodReadMap(); - Map> fwdMap = forward.getLikelihoodReadMap(); - Map> revMap = reverse.getLikelihoodReadMap(); - - // iterate through the reads, assign reads and likelihoods to the forward or reverse maps based on the read's strand - Set forwardReads = new HashSet<>(); - Set reverseReads = new HashSet<>(); - - for(GATKSAMRecord rec : origMap.keySet()) { - if (rec.isStrandless()) + private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap originalPRALM, final PerReadAlleleLikelihoodMap forwardPRALM, final PerReadAlleleLikelihoodMap reversePRALM) { + Map> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap(); + for (final GATKSAMRecord read : origReadAlleleLikelihoodMap.keySet()) { + if (read.isStrandless()) continue; - if (rec.getReadNegativeStrandFlag()) - reverseReads.add(rec); - else - forwardReads.add(rec); - } - final Iterator>> it = origMap.entrySet().iterator(); - while ( it.hasNext() ) { - final Map.Entry> record = it.next(); - if(forwardReads.contains(record.getKey())) { - fwdMap.put(record.getKey(), record.getValue()); - //logM2Debug("Dropping read " + record.getKey() + " due to overlapping read fragment rules"); - } - else if (reverseReads.contains(record.getKey())){ - revMap.put(record.getKey(),record.getValue()); + 
for (final Map.Entry alleleLikelihoodMap : origReadAlleleLikelihoodMap.get(read).entrySet()) { + final Allele allele = alleleLikelihoodMap.getKey(); + final Double likelihood = alleleLikelihoodMap.getValue(); + if (read.getReadNegativeStrandFlag()) + reversePRALM.add(read, allele, likelihood); + else + forwardPRALM.add(read, allele, likelihood); } } - } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java index fbb8892e5..5dd37dc7c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java @@ -5,7 +5,7 @@ * SOFTWARE LICENSE AGREEMENT * FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY * -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE"). * * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. @@ -21,11 +21,11 @@ * 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. * * 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. 
Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. * * 4. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. +* Copyright 2012-2016 Broad Institute, Inc. * Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. * @@ -54,83 +54,143 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; import org.apache.commons.math.MathException; import org.apache.commons.math.distribution.BinomialDistribution; import org.apache.commons.math.distribution.BinomialDistributionImpl; +import org.apache.commons.math3.util.Pair; -public class TumorPowerCalculator extends AbstractPowerCalculator{ - private double constantContamination; - private boolean enableSmoothing; +import java.util.Arrays; +import java.util.HashMap; +import java.util.OptionalInt; +import java.util.stream.IntStream; - public TumorPowerCalculator(double constantEps, double constantLodThreshold, double constantContamination) { - this(constantEps, constantLodThreshold, constantContamination, true); +/** + * We store a memo to avoid repeated computation of statistical power to detect a variant. 
+ * The key of the memo is a pair of numbers: number of reads and estimated allele fraction + */ +public class TumorPowerCalculator { + private final double errorProbability; + private final double tumorLODThreshold; + private final double contamination; + private final boolean enableSmoothing; + public static int numCacheHits = 0; + + private final HashMap cache = new HashMap(); + + public TumorPowerCalculator(double errorProbability, double constantLodThreshold, double contamination) { + this(errorProbability, constantLodThreshold, contamination, true); } - public TumorPowerCalculator(double constantEps, double constantLodThreshold, double constantContamination, boolean enableSmoothing) { - this.constantEps = constantEps; - this.constantLodThreshold = constantLodThreshold; - this.constantContamination = constantContamination; + public TumorPowerCalculator(double errorProbability, double tumorLODThreshold, double contamination, boolean enableSmoothing) { + this.errorProbability = errorProbability; + this.tumorLODThreshold = tumorLODThreshold; + this.contamination = contamination; this.enableSmoothing = enableSmoothing; } - public double cachingPowerCalculation(int n, double delta) throws MathException { - PowerCacheKey key = new PowerCacheKey(n, delta); + /** + * A helper class that acts as the key to the memo of pre-computed power + * + * TODO: Not ideal to use double as a key. Refactor such that we use as keys numAlts and numReads, which are integers. Then calculate numAlts/numReads when we need allele fraction. 
+ * + */ + private static class PowerCacheKey extends Pair { + private final Double alleleFraction; + private final Integer numReads; + + public PowerCacheKey(final int numReads, final double alleleFraction) { + super(numReads, alleleFraction); + this.alleleFraction = alleleFraction; + this.numReads = numReads; + } + + private boolean closeEnough(final double x, final double y, final double epsilon){ + return(Math.abs(x - y) < epsilon); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + PowerCacheKey that = (PowerCacheKey) o; + return (closeEnough(alleleFraction, that.alleleFraction, 0.001) && numReads != that.numReads); + } + + @Override + public int hashCode() { + int result; + long temp; + result = numReads; + temp = alleleFraction != +0.0d ? Double.doubleToLongBits(alleleFraction) : 0L; + result = 31 * result + (int) (temp ^ (temp >>> 32)); + return result; + } + } + + /** + * + * @param numReads total number of reads, REF and ALT combined, in + or - strand + * @param alleleFraction the true allele fraction estimated as the combined allele fraction from + and - reads + * @return probability of correctly calling the variant (i.e. power) given the above estimated allele fraction and number of reads. + * we compute power separately for each strand (+ and -) + * @throws MathException + * + */ + public double cachedPowerCalculation(final int numReads, final double alleleFraction) throws MathException { + PowerCacheKey key = new PowerCacheKey(numReads, alleleFraction); + // we first look up if power for given number of reads and allele fraction has already been computed and stored in the cache. + // if not we compute it and store it in the cache.
Double power = cache.get(key); if (power == null) { - power = calculatePower(n, constantEps, constantLodThreshold, delta, constantContamination, enableSmoothing); + power = calculatePower(numReads, alleleFraction); cache.put(key, power); + } else { + numCacheHits++; } - return power; + return power; } - - - - protected static double calculateTumorLod(int depth, int alts, double eps, double contam) { - double f = (double) alts / (double) depth; - return (AbstractPowerCalculator.calculateLogLikelihood(depth, alts, eps, f) - AbstractPowerCalculator.calculateLogLikelihood(depth, alts, eps, Math.min(f,contam))); + /* helper function for calculateTumorLod */ + private double calculateLogLikelihood(final int numReads, final int numAlts, final double alleleFraction) { + return((numReads-numAlts) * Math.log10( alleleFraction * errorProbability + (1 - alleleFraction)*(1 - errorProbability) ) + + numAlts * Math.log10(alleleFraction * (1 - errorProbability) + (1 - alleleFraction) * errorProbability)); } - protected static double calculatePower(int depth, double eps, double lodThreshold, double delta, double contam, boolean enableSmoothing) throws MathException { - if (depth==0) return 0; + private double calculateTumorLod(final int numReads, final int numAlts) { + final double alleleFraction = (double) numAlts / (double) numReads; + final double altLikelihod = calculateLogLikelihood(numReads, numAlts, alleleFraction); + final double refLikelihood = calculateLogLikelihood(numReads, numAlts, contamination); + return(altLikelihod - refLikelihood); +} - // calculate the probability of each configuration - double p_alt_given_e_delta = delta*(1d-eps) + (1d-delta)*eps; - BinomialDistribution binom = new BinomialDistributionImpl(depth, p_alt_given_e_delta); - double[] p = new double[depth+1]; - for(int i=0; i= lodThreshold) { - k = i; - break; - } - } + // find the smallest number of ALT reads k such that tumorLOD(k) > tumorLODThreshold + final OptionalInt smallestKAboveLogThreshold = 
IntStream.range(0, numReads + 1) + .filter(k -> calculateTumorLod(numReads, k) > tumorLODThreshold) + .findFirst(); - // if no depth meets the lod score, the power is zero - if (k == -1) { + if (! smallestKAboveLogThreshold.isPresent()){ return 0; } - double power = 0; + if (smallestKAboveLogThreshold.getAsInt() <= 0){ + throw new IllegalStateException("smallest k that meets the tumor LOD threshold is less than or equal to 0"); + } + + double power = Arrays.stream(binomialProbabilities, smallestKAboveLogThreshold.getAsInt(), binomialProbabilities.length).sum(); // here we correct for the fact that the exact lod threshold is likely somewhere between // the k and k-1 bin, so we prorate the power from that bin - // the k and k-1 bin, so we prorate the power from that bin - // if k==0, it must be that lodThreshold == lod[k] so we don't have to make this correction - if ( enableSmoothing && k > 0 ) { - double x = 1d - (lodThreshold - lod[k-1]) / (lod[k] - lod[k-1]); - power = x*p[k-1]; - } - - for(int i=k; i alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection(); + Allele refA = Allele.create("A", true); + Allele altT = Allele.create("T", false); + alleleCounts.set(refA, 40); + alleleCounts.set(altT, 10); + assertEquals((int)alleleCounts.getRef(), 40); + assertEquals((int)alleleCounts.getAlt(altT), 10); } - val cv = new CombineVariants() - cv.reference_sequence = reference - cv.memoryLimit = 2 - cv.setKey = "null" - cv.minimumN = minN - cv.memoryLimit = 16 - cv.filteredrecordsmergetype = FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED - cv.filteredAreUncalled = true - cv.variant = perSampleVcfs - cv.out = genotypesVcf - - // using this instead of "sites_only" because we want to keep the AC info - val vc = new VcfCutter() - vc.inVcf = genotypesVcf - vc.outVcf = finalVcf - - add (cv, vc) - - } + @Test + public void testGet() throws Exception { + PerAlleleCollection alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection(); + Allele refA = 
Allele.create("A", true); + Allele altT = Allele.create("T", false); + alleleCounts.set(refA, 40); + alleleCounts.set(altT, 10); + assertEquals((int)alleleCounts.get(refA), 40); + assertEquals((int)alleleCounts.get(altT), 10); + } - def createM2Config(bam : File, outputVcf : File): org.broadinstitute.gatk.queue.extensions.gatk.MuTect2 = { - val mutect2 = new org.broadinstitute.gatk.queue.extensions.gatk.MuTect2 + @Test + public void testGetAltAlleles() throws Exception { + PerAlleleCollection alleleCounts = PerAlleleCollection.createPerAltAlleleCollection(); + Allele altA = Allele.create("A", false); + Allele altC = Allele.create("C", false); + Allele altG = Allele.create("G", false); + Allele altT = Allele.create("T", false); + Allele[] altAlleles = {altA, altC, altG, altT}; + for (Allele altAllele : altAlleles ) { + alleleCounts.set(altAllele, 3); + } - mutect2.reference_sequence = reference - mutect2.artifact_detection_mode = true - mutect2.intervalsString :+= intervals - mutect2.memoryLimit = 2 - mutect2.input_file = List(new TaggedFile(bam, "tumor")) + for (Allele altAllele : altAlleles ) { + assertTrue(alleleCounts.getAltAlleles().contains(altAllele)); + } - mutect2.scatterCount = scatter - mutect2.out = outputVcf - - mutect2 - } -} - -class VcfCutter extends CommandLineFunction { - @Input(doc = "vcf to cut") var inVcf: File = _ - @Output(doc = "output vcf") var outVcf: File = _ - - def commandLine = "cat %s | cut -f1-8 > %s".format(inVcf, outVcf) + assertFalse(alleleCounts.getAltAlleles().contains(Allele.create("A", true))); + } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_dream.scala b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculatorTest.java similarity index 86% rename from protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_dream.scala rename to 
protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculatorTest.java index 728c44681..2ed6584a2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/run_M2_dream.scala +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculatorTest.java @@ -49,41 +49,32 @@ * 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.gatk.queue.qscripts.dev +package org.broadinstitute.gatk.tools.walkers.cancer.m2; -import org.broadinstitute.gatk.queue.QScript -import org.broadinstitute.gatk.queue.extensions.gatk._ +import org.testng.annotations.Test; -class run_M2_dream extends QScript { +import static org.testng.Assert.*; - @Argument(shortName = "L", required=false, doc = "Intervals file") - var intervalsFile: List[File] = Nil - @Argument(shortName = "normal", required=true, doc = "Normal sample BAM") - var normalBAM: String = "" - @Argument(shortName = "tumor", required=true, doc = "Tumor sample BAM") - var tumorBAM: String = "" - @Argument(shortName = "o", required=true, doc = "Output file") - var outputFile: String = "" - @Argument(shortName = "sc", required=false, doc = "base scatter count") - var scatter: Int = 10 +/** + * Created by tsato on 6/19/16. 
+ */ +public class TumorPowerCalculatorTest { + + private boolean closeEnough(double x, double y, double epsilon){ + return(Math.abs(x - y) < epsilon); + } + + @Test + public void testCachedPowerCalculation() throws Exception { + TumorPowerCalculator tpc = new TumorPowerCalculator(0.001, 2.0, 0.0); + final double epsilon = 0.0001; + assertTrue(closeEnough(tpc.cachedPowerCalculation(100,0.2), 1.0, epsilon)); + assertTrue(closeEnough(tpc.cachedPowerCalculation(30,0.1), 0.8864, epsilon)); + assertTrue(closeEnough(tpc.cachedPowerCalculation(0,0.02), 0.0, epsilon)); + assertTrue(closeEnough(tpc.cachedPowerCalculation(5, 0.01), 0.0520, epsilon)); - def script() { + } - val mutect2 = new MuTect2 - mutect2.reference_sequence = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - mutect2.cosmic :+= new File("/xchip/cga/reference/hg19/hg19_cosmic_v54_120711.vcf") - mutect2.dbsnp = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_138.b37.vcf") - mutect2.normal_panel :+= new File("/xchip/cga/reference/hg19/wgs_hg19_125_cancer_blood_normal_panel.vcf") - - mutect2.intervalsString = intervalsFile - mutect2.memoryLimit = 2 - mutect2.input_file = List(new TaggedFile(normalBAM, "normal"), new TaggedFile(tumorBAM, "tumor")) - - mutect2.scatterCount = scatter - mutect2.out = outputFile - add(mutect2) - } - -} +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java index 40360262a..193c01120 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java @@ -303,10 +303,10 @@ public class VariantAnnotatorEngine { if ( !(annotationType 
instanceof ActiveRegionBasedAnnotation) ) continue; - final Map annotationsFromCurrentType = annotationType.annotate(referenceContext, perReadAlleleLikelihoodMap, newGenotypeAnnotatedVC); - if (annotationsFromCurrentType != null) { - infoAnnotations.putAll(annotationsFromCurrentType); - } + final Map annotationsFromCurrentType = annotationType.annotate(null, walker, referenceContext, null, newGenotypeAnnotatedVC, perReadAlleleLikelihoodMap); + if (annotationsFromCurrentType != null) { + infoAnnotations.putAll(annotationsFromCurrentType); + } } // create a new VC with info and genotype annotations diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java index 4cc189aea..5aae6af85 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java @@ -488,6 +488,7 @@ public final class AlignmentUtils { } } + // pos counts read bases. 
alignmentPos counts ref bases int pos = 0; int alignmentPos = 0; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index 5c7494d7d..06c626496 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -134,6 +134,11 @@ public final class GATKVCFConstants { public static final String OXOG_FRACTION_KEY = "FOXOG"; public static final String AS_INSERT_SIZE_RANK_SUM_KEY = "AS_InsertSizeRankSum"; public static final String AS_RAW_INSERT_SIZE_RANK_SUM_KEY = "AS_RAW_InsertSizeRankSum"; + public static final String TLOD_FWD_KEY = "TLOD_FWD"; + public static final String TLOD_REV_KEY = "TLOD_REV"; + public static final String TUMOR_SB_POWER_FWD_KEY = "TUMOR_SB_POWER_FWD"; + public static final String TUMOR_SB_POWER_REV_KEY = "TUMOR_SB_POWER_REV"; + //FORMAT keys public static final String ALLELE_BALANCE_KEY = "AB"; @@ -173,6 +178,7 @@ public final class GATKVCFConstants { public static final String STR_CONTRACTION_FILTER_NAME = "str_contraction"; //M2 public static final String TUMOR_LOD_FILTER_NAME = "t_lod_fstar"; //M2 public static final String TRIALLELIC_SITE_FILTER_NAME = "triallelic_site"; //M2 + public static final String STRAND_ARTIFACT_FILTER_NAME = "strand_artifact"; // M2 // Symbolic alleles public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT"; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java index e8f960bff..a64eccf92 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java +++ 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java @@ -72,6 +72,10 @@ public class GATKVCFHeaderLines { addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TUMOR_LOD_FILTER_NAME, "Tumor does not meet likelihood threshold")); addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME, "Site filtered due to contraction of short tandem repeat region")); addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME, "Site filtered because more than two alt alleles pass tumor LOD")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, "Strand bias detected: evidence for alt allele comes from one read direction only")); + // addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.CLUSTERED_READ_POSITION_FILTER_NAME, "Variant appears in similar read positions")); + + addFormatLine(new VCFFormatHeaderLine(ALLELE_BALANCE_KEY, 1, VCFHeaderLineType.Float, "Allele balance for each het genotype")); addFormatLine(new VCFFormatHeaderLine(BASE_COUNTS_BY_SAMPLE_KEY, 4, VCFHeaderLineType.Integer, "Counts of each base by sample")); @@ -201,6 +205,10 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.NORMAL_LOD_KEY, 1, VCFHeaderLineType.String, "Normal LOD score")); addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY, 1, VCFHeaderLineType.String, "Count from Panel of Normals")); addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_LOD_KEY, 1, VCFHeaderLineType.String, "Tumor LOD score")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TLOD_FWD_KEY,1,VCFHeaderLineType.Float,"TLOD from forward reads only")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TLOD_REV_KEY,1,VCFHeaderLineType.Float,"TLOD from reverse reads only")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY,1,VCFHeaderLineType.Float,"Strand bias power for forward reads")); + addInfoLine(new 
VCFInfoHeaderLine(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY,1,VCFHeaderLineType.Float,"Stand bias power for reverse reads")); } } From 1ff234e7dd035e1894f1959b99c81181f69f10cc Mon Sep 17 00:00:00 2001 From: Steve Huang Date: Thu, 30 Jun 2016 22:36:49 -0400 Subject: [PATCH 16/68] remove alt alleles, when genotype count is explosively large, based on alleles' highest supporting haplotype score; max tolerable genotype count is controlled by a default value overridable by user remove alt alleles, when genotype count is explosively large, based on alleles' highest supporting haplotype score; max tolerable genotype count is controlled by a default value overridable by user --- ...GenotypeCalculationArgumentCollection.java | 22 +- .../GenotypeLikelihoodCalculator.java | 32 +-- .../GenotypeLikelihoodCalculators.java | 91 ++++++--- .../HaplotypeCallerGenotypingEngine.java | 190 ++++++++++++++---- .../GenotypeLikelihoodCalculatorUnitTest.java | 9 + .../gatk/utils/haplotype/Haplotype.java | 1 + 6 files changed, 268 insertions(+), 77 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index d8c10145f..4a5456d94 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -122,12 +122,32 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend * that you not play around with this parameter. 
* - * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6. + * See also {@link #MAX_GENOTYPE_COUNT}. */ @Advanced @Argument(fullName = "max_alternate_alleles", shortName = MAX_ALTERNATE_ALLELES_SHORT_NAME, doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 6; + /** + * If there are more than this number of genotypes at a locus presented to the genotyper, then only this many genotypes will be used. + * The possible genotypes are simply different ways of partitioning alleles given a specific ploidy assumption. + * Therefore, we remove genotypes from consideration by removing alternate alleles that are the least well supported. + * The estimate of allele support is based on the ranking of the candidate haplotypes coming out of the graph building step. + * Note that the reference allele is always kept. + * + * Note that genotyping sites with large genotype counts is both CPU and memory intensive. + * Unless there is a good reason to change the default value, we highly recommend that you not play around with this parameter. + * + * The maximum number of alternative alleles used in the genotyping step will be the lesser of the two: + * 1. the largest number of alt alleles, given ploidy, that yields a genotype count no higher than {@link #MAX_GENOTYPE_COUNT} + * 2. the value of {@link #MAX_ALTERNATE_ALLELES} + * + * See also {@link #MAX_ALTERNATE_ALLELES}. + */ + @Advanced + @Argument(fullName = "max_genotype_count", shortName = "maxGT", doc = "Maximum number of genotypes to consider at any site", required = false) + public int MAX_GENOTYPE_COUNT = 1024; + /** * Determines the maximum number of PL values that will be logged in the output.
If the number of genotypes * (which is determined by the ploidy and the number of alleles) exceeds the value provided by this argument, diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculator.java index 1a3cdf2b8..aead2ab15 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculator.java @@ -163,7 +163,7 @@ public class GenotypeLikelihoodCalculator { *

This is in fact a shallow copy if {@link GenotypeLikelihoodCalculators#ploidyLog10}

and is not meant to be modified by * this class.

*/ - private final double[] log10; + private final double[] ploidyLog10; /** * Buffer field use as a temporal container for sorted allele counts when calculating the likelihood of a @@ -202,24 +202,22 @@ public class GenotypeLikelihoodCalculator { * Creates a new calculator providing its ploidy and number of genotyping alleles. */ protected GenotypeLikelihoodCalculator(final int ploidy, final int alleleCount, - final int[][] alleleFirstGenotypeOffsetByPloidy, - final GenotypeAlleleCounts[][] genotypeTableByPloidy, - final double[] ploidyLog10) { + final int[][] alleleFirstGenotypeOffsetByPloidy, + final GenotypeAlleleCounts[][] genotypeTableByPloidy, + final double[] ploidyLog10) { + this.alleleFirstGenotypeOffsetByPloidy = alleleFirstGenotypeOffsetByPloidy; genotypeAlleleCounts = genotypeTableByPloidy[ploidy]; this.alleleCount = alleleCount; this.ploidy = ploidy; genotypeCount = this.alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount]; - if (genotypeCount == GenotypeLikelihoodCalculators.GENOTYPE_COUNT_OVERFLOW) - throw new IllegalArgumentException( - String.format("the combination of ploidy (%s) and number of alleles (%s) results in a very large number of genotypes (> %s). You need to limit ploidy or the number of alternative alleles to analyze this locus", - ploidy,alleleCount,Integer.MAX_VALUE)); + alleleHeap = new IntMaxHeap(ploidy); readLikelihoodsByGenotypeIndex = new double[genotypeCount][]; - log10 = ploidyLog10; + this.ploidyLog10 = ploidyLog10; // The number of possible components is limited by distinct allele count and ploidy. 
maximumDistinctAllelesInGenotype = Math.min(ploidy, alleleCount); - genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype << 1]; + genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype*2]; } /** @@ -349,7 +347,7 @@ public class GenotypeLikelihoodCalculator { */ private double[] genotypeLikelihoods(final double[][] readLikelihoodsByGenotypeIndex, final int readCount) { final double[] result = new double[genotypeCount]; - final double denominator = readCount * log10[ploidy]; // instead of dividing each read likelihood by ploidy + final double denominator = readCount * ploidyLog10[ploidy]; // instead of dividing each read likelihood by ploidy // ( so subtract log10(ploidy) ) we multiply them all and the divide by ploidy^readCount (so substract readCount * log10(ploidy) ) for (int g = 0; g < genotypeCount; g++) { final double[] likelihoodsByRead = readLikelihoodsByGenotypeIndex[g]; @@ -464,7 +462,9 @@ public class GenotypeLikelihoodCalculator { * exactly one allele present in the genotype. */ private void singleComponentGenotypeLikelihoodByRead(final GenotypeAlleleCounts genotypeAlleleCounts, - final double[] likelihoodByRead, final double[] readLikelihoodComponentsByAlleleCount, final int readCount) { + final double[] likelihoodByRead, + final double[] readLikelihoodComponentsByAlleleCount, + final int readCount) { final int allele = genotypeAlleleCounts.alleleIndexAt(0); // the count of the only component must be = ploidy. int offset = (allele * (ploidy + 1) + ploidy) * readCount; @@ -493,7 +493,7 @@ public class GenotypeLikelihoodCalculator { // p = 2 because the frequency == 1 we already have it. 
for (int frequency = 2, destinationOffset = frequency1Offset + readCount; frequency <= ploidy; frequency++) { - final double log10frequency = log10[frequency]; + final double log10frequency = ploidyLog10[frequency]; for (int r = 0, sourceOffset = frequency1Offset; r < readCount; r++) readAlleleLikelihoodByAlleleCount[destinationOffset++] = readAlleleLikelihoodByAlleleCount[sourceOffset++] + log10frequency; @@ -620,7 +620,11 @@ public class GenotypeLikelihoodCalculator { * @param destination where to store the new genotype index mapping to old. * @param sortedAlleleCountsBuffer a buffer to re-use to get the genotype-allele-count's sorted allele counts. */ - private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex, final GenotypeAlleleCounts alleleCounts, final int[] oldToNewAlleleIndexMap, final int[] destination, final int[] sortedAlleleCountsBuffer) { + private void genotypeIndexMapPerGenotypeIndex(final int newGenotypeIndex, + final GenotypeAlleleCounts alleleCounts, + final int[] oldToNewAlleleIndexMap, + final int[] destination, + final int[] sortedAlleleCountsBuffer) { final int distinctAlleleCount = alleleCounts.distinctAlleleCount(); alleleCounts.copyAlleleCounts(sortedAlleleCountsBuffer,0); for (int j = 0, jj = 0; j < distinctAlleleCount; j++) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java index 83b280bff..da0ff0975 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java @@ -51,7 +51,11 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; +import org.broadinstitute.gatk.utils.MathUtils; +import 
org.broadinstitute.gatk.utils.exceptions.GATKException; + import java.util.Arrays; +import java.util.stream.IntStream; /** * Genotype likelihood calculator utility. @@ -116,7 +120,10 @@ public class GenotypeLikelihoodCalculators { private volatile static GenotypeAlleleCounts[][] genotypeTableByPloidy = buildGenotypeAlleleCountsTable(maximumPloidy,maximumAllele,alleleFirstGenotypeOffsetByPloidy); - + /** + * Cached log10 values for the first integer up to the maximum ploidy requested thus far. + */ + private volatile static double[] ploidyLog10 = IntStream.range(0, maximumPloidy + 1).mapToDouble(Math::log10).toArray(); /** * Build the table with the genotype offsets based on ploidy and the maximum allele index with representation @@ -291,40 +298,29 @@ public class GenotypeLikelihoodCalculators { return result; } - /** - * Cached log10 values for the first integer up to the maximum ploidy requested thus far. - */ - private volatile static double[] ploidyLog10; - - // Initialize {@link #ploidyLog10}. - static { - ploidyLog10 = new double[maximumPloidy + 1]; - for (int i = 0; i <= maximumPloidy; i++) - ploidyLog10[i] = Math.log10(i); - } - /** * Returns an instance given its ploidy and the number of alleles. * * @param alleleCount the required allele-count. * @param ploidy the required ploidy-count. * - * @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is {@code null}, or - * the resulting number of genotypes is too large. + * @throws IllegalArgumentException if either {@code ploidy} or {@code alleleCount} is negative, or the resulting number of genotypes is too large. * * @return never {@code null}. 
*/ - public static GenotypeLikelihoodCalculator getInstance(final int ploidy, - final int alleleCount) { + public static GenotypeLikelihoodCalculator getInstance(final int ploidy, final int alleleCount) { checkPloidyAndMaximumAllele(ploidy, alleleCount); // Non-thread safe (fast) check on tables capacities, - // if not enough capacity we expand the tables in a thread-safe manner: - if (alleleCount > maximumAllele || ploidy > maximumPloidy) - ensureCapacity(alleleCount, ploidy); + // if not enough capacity we expand the tables in a thread-safe manner + // also checks if the requested ploidy and allele count result in a genotype count too large to deal with + if(calculateGenotypeCountUsingTables(ploidy, alleleCount) == GENOTYPE_COUNT_OVERFLOW){ + final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1); + throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount)); + } // At this point the tables must have at least the requested capacity, likely to be much more. - return new GenotypeLikelihoodCalculator(ploidy,alleleCount,alleleFirstGenotypeOffsetByPloidy,genotypeTableByPloidy,ploidyLog10); + return new GenotypeLikelihoodCalculator(ploidy, alleleCount, alleleFirstGenotypeOffsetByPloidy, genotypeTableByPloidy, ploidyLog10); } /** @@ -413,14 +409,59 @@ public class GenotypeLikelihoodCalculators { * @param ploidy the requested ploidy. * @param alleleCount the requested number of alleles. * - * @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative. + * @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative or + * the number of genotypes is too large (more than {@link Integer#MAX_VALUE}). * - * @return 0 or greater. + * @return the number of genotypes given ploidy and allele count (0 or greater). 
*/ public final static int genotypeCount(final int ploidy, final int alleleCount) { + + final int result = calculateGenotypeCountUsingTables(ploidy, alleleCount); + if (result == GENOTYPE_COUNT_OVERFLOW) { + final double largeGenotypeCount = MathUtils.binomialCoefficient(ploidy + alleleCount - 1, alleleCount - 1); + throw new IllegalArgumentException(String.format("the number of genotypes is too large for ploidy %d and allele %d: approx. %.0f", ploidy, alleleCount, largeGenotypeCount)); + } + return result; + } + + /** + * Compute the maximally acceptable allele count (ref allele included) given the maximally acceptable genotype count. + * @param ploidy sample ploidy + * @param maxGenotypeCount maximum number of genotype count used to calculate upper bound on number of alleles given ploidy + * @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative. + * @return the maximally acceptable allele count given ploidy and maximum number of genotypes acceptable + */ + public static int computeMaxAcceptableAlleleCount(final int ploidy, final int maxGenotypeCount){ + + checkPloidyAndMaximumAllele(ploidy, ploidy); // a hack to check ploidy makes sense (could duplicate code but choice must be made) + + final double log10MaxGenotypeCount = Math.log10(maxGenotypeCount); + + // Math explanation: genotype count is determined by ${P+A-1 \choose A-1}$, this leads to constraint + // $\log(\frac{(P+A-1)!}{(A-1)!}) \le \log(P!G)$, + // where $P$ is ploidy, $A$ is allele count, and $G$ is maxGenotypeCount + // The upper and lower bounds of the left hand side of the constraint are $P \log(A-1+P)$ and $P \log(A)$ + // which require $A$ to be searched in interval $[10^{\log(P!G)/P} - (P-1), 10^{\log(P!G)/P}]$ + // Denote $10^{\log(P!G)/P}$ as $x$ in the code.
+ + final double x = Math.pow(10, (MathUtils.log10Factorial(ploidy) + log10MaxGenotypeCount)/ploidy ); + final int lower = (int)Math.floor(x) - ploidy - 1; + final int upper = (int)Math.ceil(x); + for(int a=upper; a>=lower; --a){// check one by one + + final double log10GTCnt = MathUtils.log10BinomialCoefficient(ploidy+a-1, a-1); + if(log10MaxGenotypeCount >= log10GTCnt) { + return a; + } + } + throw new GATKException("Code should never reach here."); + } + + private static int calculateGenotypeCountUsingTables(int ploidy, int alleleCount) { checkPloidyAndMaximumAllele(ploidy, alleleCount); - if (ploidy > maximumPloidy || alleleCount > maximumAllele) - ensureCapacity(alleleCount,ploidy); + if (ploidy > maximumPloidy || alleleCount > maximumAllele) { + ensureCapacity(alleleCount, ploidy); + } return alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount]; } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 30e7efe22..ad0f2b773 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -56,6 +56,8 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import htsjdk.samtools.util.StringUtil; import htsjdk.variant.variantcontext.*; +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.math3.stat.StatUtils; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; @@ -73,6 +75,7 @@ import 
org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import java.util.*; +import java.util.stream.Collectors; /** * {@link HaplotypeCaller}'s genotyping strategy implementation. @@ -84,6 +87,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine practicaAlleleCountForPloidy = new HashMap<>(); + private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; protected final boolean doPhysicalPhasing; @@ -135,7 +141,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine calls; @@ -189,16 +195,16 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine 0"}) @Ensures("result != null") // TODO - can this be refactored? this is hard to follow! - CalledHaplotypes assignGenotypeLikelihoods( final List haplotypes, - final ReadLikelihoods readLikelihoods, - final Map> perSampleFilteredReadList, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final RefMetaDataTracker tracker, - final List activeAllelesToGenotype, - final boolean emitReferenceConfidence) { + CalledHaplotypes assignGenotypeLikelihoods(final List haplotypes, + final ReadLikelihoods readLikelihoods, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final RefMetaDataTracker tracker, + final List activeAllelesToGenotype, + final boolean emitReferenceConfidence) { // sanity check input arguments if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods); @@ -232,15 +238,17 @@ public class HaplotypeCallerGenotypingEngine 
extends GenotypingEngine mergeMap = new LinkedHashMap<>(); mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele @@ -248,13 +256,37 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> alleleMapper = createAlleleMapper(mergeMap, eventMapper); - if( configuration.DEBUG && logger != null ) { if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); } - final ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); + final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); + + // if the number of allele is so high that enumerating all possible genotypes is impractical, + // as determined by MAX_GENOTYPE_COUNT_TO_ENUMERATE, + // trim alleles that are not well supported by good-scored haplotypes, + // otherwise alias, i.e. no trimming if genotype count is small + final int originalAlleleCount = alleleMapper.size(); + Integer practicalAlleleCount = practicaAlleleCountForPloidy.get(ploidy); + if(practicalAlleleCount==null) { // maximum allele count given this ploidy and MAX_GENOTYPE_COUNT_TO_ENUMERATE hasn't been computed + practicalAlleleCount = GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(ploidy, MAX_GENOTYPE_COUNT_TO_ENUMERATE); + } + + Map> practicalAlleleMapper = null; + if (practicalAlleleCount < originalAlleleCount) { + practicalAlleleMapper = reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(alleleMapper, practicalAlleleCount); + if( configuration.DEBUG && logger != null ) { + logger.warn(String.format("Removed alt alleles where ploidy is %d and original allele count is %d, whereas after trimming the allele count becomes %d", + ploidy, originalAlleleCount, practicalAlleleCount)); + logger.warn(String.format("Alleles kept are:%s", practicalAlleleMapper.keySet())); + } + }else{ + 
practicalAlleleMapper = alleleMapper; + } + + final ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(practicalAlleleMapper, + genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), + ALLELE_EXTENSION)); if (configuration.isSampleContaminationPresent()) readAlleleLikelihoods.contaminationDownsampling(configuration.getSampleContamination()); @@ -269,10 +301,23 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(final Map> alleleMapper, final int desiredNumOfAlleles) { + + final PriorityQueue altAlleleMaxPriorityQ = new PriorityQueue<>((sa1, sa2) -> - sa2.compareTo(sa1)); // -1 to turn it into max priority q + + final Set allelesToRetain = new HashSet<>(); + // populate allelePriorityQ with the relevant information + for(final Allele allele : alleleMapper.keySet()){ + + if(allele.isReference()){ // collect scores information only on alt alleles; ref allele is never trimmed by this function + allelesToRetain.add(allele); + continue; + } + + final List hapScores = alleleMapper.get(allele).stream().map(hap -> hap.getScore()).collect(Collectors.toList()); + Collections.sort(hapScores); + final Double highestScore = hapScores.get(hapScores.size()-1); + final Double secondHighestScore = hapScores.size()>1 ? hapScores.get(hapScores.size()-2) : Double.NEGATIVE_INFINITY; + + altAlleleMaxPriorityQ.add(new AlleleScoredByHaplotypeScores(allele, highestScore, secondHighestScore)); + } + + while(allelesToRetain.size() allelesToRetain.contains(p.getKey())) + .collect(Collectors.toMap(p->p.getKey(), p->p.getValue())); + } + + /** + * A utility class that provides ordering information, given best and second best haplotype scores. + * If there's a tie between the two alleles when comparing their best haplotype score, the second best haplotype score + * is used for breaking the tie. In the case that one allele doesn't have a second best allele, i.e. 
it has only one + * supportive haplotype, its second best score is set as null, and is always considered "worse" than another allele + * that has the same best haplotype score, but also has a second best haplotype score. + * TODO: in the extremely unlikely case that two alleles, having the same best haplotype score, neither have a second + * best haplotype score, the case is undecided. + */ + private static final class AlleleScoredByHaplotypeScores { + private final Allele allele; + private final Double bestHaplotypeScore; + private final Double secondBestHaplotypeScore; + + public AlleleScoredByHaplotypeScores(final Allele allele, final Double bestHaplotypeScore, final Double secondBestHaplotypeScore){ + this.allele = allele; + this.bestHaplotypeScore = bestHaplotypeScore; + this.secondBestHaplotypeScore = secondBestHaplotypeScore; + } + + public int compareTo(final AlleleScoredByHaplotypeScores other) { + if(bestHaplotypeScore > other.bestHaplotypeScore) { + return 1; + } else if (bestHaplotypeScore < other.bestHaplotypeScore) { + return -1; + } else { + return secondBestHaplotypeScore > other.secondBestHaplotypeScore ? 1 : -1; + } + } + + public Allele getAllele(){ + return allele; + } + } + /** * Reduce the number alternative alleles in a read-likelihoods collection to the maximum-alt-allele user parameter value. *

@@ -462,7 +579,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> constructHaplotypeMapping(final List originalCalls, - final Set calledHaplotypes) { + final Set calledHaplotypes) { final Map> haplotypeMap = new HashMap<>(originalCalls.size()); for ( final VariantContext call : originalCalls ) { // don't try to phase if there is not exactly 1 alternate allele @@ -674,14 +791,13 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine prepareReadAlleleLikelihoodsForAnnotation( - final ReadLikelihoods readHaplotypeLikelihoods, - final Map> perSampleFilteredReadList, - final GenomeLocParser genomeLocParser, - final boolean emitReferenceConfidence, - final Map> alleleMapper, - final ReadLikelihoods readAlleleLikelihoodsForGenotyping, - final VariantContext call) { + protected ReadLikelihoods prepareReadAlleleLikelihoodsForAnnotation(final ReadLikelihoods readHaplotypeLikelihoods, + final Map> perSampleFilteredReadList, + final GenomeLocParser genomeLocParser, + final boolean emitReferenceConfidence, + final Map> alleleMapper, + final ReadLikelihoods readAlleleLikelihoodsForGenotyping, + final VariantContext call) { final ReadLikelihoods readAlleleLikelihoodsForAnnotations; final GenomeLoc loc = genomeLocParser.createGenomeLoc(call); @@ -744,10 +860,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine decomposeHaplotypesIntoVariantContexts(final List haplotypes, - final ReadLikelihoods readLikelihoods, - final byte[] ref, - final GenomeLoc refLoc, - final List activeAllelesToGenotype) { + final ReadLikelihoods readLikelihoods, + final byte[] ref, + final GenomeLoc refLoc, + final List activeAllelesToGenotype) { final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file @@ -782,8 +898,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine getVCsAtThisLocation(final List 
haplotypes, - final int loc, - final List activeAllelesToGenotype) { + final int loc, + final List activeAllelesToGenotype) { // the overlapping events to merge into a common reference view final List eventsAtThisLoc = new ArrayList<>(); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java index e7a9c9467..27c7ac04e 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java @@ -170,6 +170,15 @@ public class GenotypeLikelihoodCalculatorUnitTest { } + @Test + public void testComputeMaxAcceptableAlleleCount(){ + Assert.assertEquals(1024, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(1, 1024)); + Assert.assertEquals(44, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(2, 1024)); + Assert.assertEquals(17, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(3, 1024)); + Assert.assertEquals(5, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(10, 1024)); + Assert.assertEquals(3, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(20, 1024)); + Assert.assertEquals(2, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(100, 1024)); + } // Simple inefficient calculation of the genotype count given the ploidy. 
private int calculateGenotypeCount(final int ploidy, final int alleleCount) { if (ploidy == 0) diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java index 28df151c9..59459b027 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java @@ -130,6 +130,7 @@ public class Haplotype extends Allele { final Haplotype ret = new Haplotype(newBases, isReference()); ret.setCigar(newCigar); ret.setGenomeLocation(loc); + ret.setScore(score); ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef()); return ret; } From 45607d1b30c15013ddaa13a40ebc8bddd38c3f98 Mon Sep 17 00:00:00 2001 From: Valentin Ruano Rubio Date: Thu, 30 Jun 2016 11:41:03 -0400 Subject: [PATCH 17/68] RCM Variant sites merger won't output PL when there are too many alleles in order to avoid memory issues with large cohort runs. Small additional "cosmetic" changes to the code Addresses issue #1419. 
--- ...ferenceConfidenceVariantContextMerger.java | 41 ++++++--- .../VariantContextMergerUnitTest.java | 86 ++++++++++++++++++- 2 files changed, 111 insertions(+), 16 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java index 5ad0c581c..05c0a930d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java @@ -63,7 +63,6 @@ import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.GATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; @@ -147,11 +146,22 @@ public class ReferenceConfidenceVariantContextMerger { final List allelesList = new ArrayList<>(finalAlleleSet); + //TODO quick fix patch to address memory issue described in https://github.com/broadinstitute/gsa-unstable/issues/1419 + //TODO The reason to impose this limit here is that in practice the tool that is affected by the mem issue, GenotypeGVCFs will + //TODO skip the site when the number of alleles is bigger than that limit so this change does not change the outputs. + //TODO However we need to change this with a more permanent solution. 
+ //TODO For example we could impose maxAltAlleles or maxGenotypes in the output at every step including CombineGVCFs and GenotypeGVCFs + //TODO in order to avoid to add yet another limit . + final boolean shouldComputePLs = allelesList.size() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED; + if (!shouldComputePLs) { + logger.debug(String.format("location %s:%d has too many alleles (%d) to compute PLs (maximum allowed %d). PL genotype annotations won't be produced at this site", loc.getContig(), loc.getStart(), allelesList.size(), GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)); + } + for ( final Pair> pair : vcAndNewAllelePairs ) { final VariantContext vc = pair.getFirst(); final List remappedAlleles = pair.getSecond(); - mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified); + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified, shouldComputePLs); // special case DP (add it up) for all events if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) { @@ -413,18 +423,20 @@ public class ReferenceConfidenceVariantContextMerger { * @param remappedAlleles the list of remapped alleles for the sample * @param targetAlleles the list of target alleles * @param samplesAreUniquified true if sample names have been uniquified + * @param shouldComputePLs true if the PL can be computed in this merge. */ private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, final VariantContext vc, final List remappedAlleles, final List targetAlleles, - final boolean samplesAreUniquified) { + final boolean samplesAreUniquified, + final boolean shouldComputePLs) { final int maximumPloidy = vc.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies) // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. 
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][]; final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size()); - int[] perSampleIndexesOfRelevantAlleles; + for (final Genotype g : vc.getGenotypes()) { final String name; @@ -433,23 +445,28 @@ public class ReferenceConfidenceVariantContextMerger { else name = g.getSampleName(); final int ploidy = g.getPloidy(); - final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())) + .noPL(); genotypeBuilder.name(name); - final boolean hasPL = g.hasPL(); + + final boolean doPLs = shouldComputePLs && g.hasPL(); + final boolean hasAD = g.hasAD(); final boolean hasSAC = g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY); - if (hasPL || hasSAC) { - perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g); - if (g.hasPL()) { + if (doPLs || hasSAC || hasAD) { + final int[] perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g); + if (doPLs) { // lazy initialization of the genotype index map by ploidy. final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles) : genotypeIndexMapsByPloidy[ploidy]; final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); - final int[] AD = g.hasAD() ? 
generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null; - genotypeBuilder.PL(PLs).AD(AD); + genotypeBuilder.PL(PLs); } - if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) { + if (hasAD) { + genotypeBuilder.AD(generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles)); + } + if (hasSAC) { final List sacIndexesToUse = adaptToSACIndexes(perSampleIndexesOfRelevantAlleles); final int[] SACs = GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse); genotypeBuilder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, SACs); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java index dabfa27d5..1705ebe49 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java @@ -51,13 +51,16 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import htsjdk.samtools.SAMFileHeader; import htsjdk.variant.variantcontext.*; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; @@ -65,10 +68,7 @@ import org.testng.annotations.Test; import java.io.File; 
import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.util.*; /** * Tests {@link org.broadinstitute.gatk.tools.walkers.variantutils.ReferenceConfidenceVariantContextMerger}. @@ -170,6 +170,84 @@ public class VariantContextMergerUnitTest extends BaseTest { } } + @Test + public void testMergeTooManyAlleles() { + final int ALLELE_COUNT = 100; + final int SAMPLE_COUNT = 200; + final double MNP_PROB = 0.1; + final double MNP_MUT_RATE = 0.1; + final double NO_PL_PROB = 0.1; + final Random rdn = new Random(13); + final RandomDNA randomDNA = new RandomDNA(rdn); + final Allele refAllele = Allele.create(randomDNA.nextBases(30), true); + final Set alleles = new LinkedHashSet<>(ALLELE_COUNT); + alleles.add(refAllele); + alleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + while (alleles.size() < ALLELE_COUNT) { + if (rdn.nextDouble() <= MNP_PROB) { + final byte[] bytes = refAllele.getBases().clone(); + for (int i = 0; i < bytes.length; i++) { + bytes[i] = rdn.nextDouble() <= MNP_MUT_RATE ? randomDNA.nextBase() : bytes[i]; + } + if (!Arrays.equals(bytes, refAllele.getBases())) { + alleles.add(Allele.create(bytes, false)); + } + } else { + final int newLength = rdn.nextInt(refAllele.getBases().length + 100) + 1; + if (newLength < refAllele.getBases().length) { + alleles.add(Allele.create(Arrays.copyOf(refAllele.getBases(), newLength), false)); + } else if (newLength > refAllele.getBases().length) { + final byte[] bases = randomDNA.nextBases(newLength); + System.arraycopy(refAllele.getBases(), 0, bases, 0, refAllele.getBases().length); + } // else same length... we skip and try again. 
+ } + } + final List alleleList = new ArrayList<>(alleles); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); + final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + final List variantContexts = new ArrayList<>(SAMPLE_COUNT); + for (int i = 0; i < SAMPLE_COUNT; i++) { + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder("SAMPLE_" + (i+1)); + genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(2)); + final List sampleAlleles = new ArrayList<>(); + sampleAlleles.add(refAllele); + sampleAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + for (int j = 2; j < alleles.size(); j++) { + if (rdn.nextDouble() <= 0.01) { + sampleAlleles.add(alleleList.get(j)); + } + } + if (rdn.nextDouble() > NO_PL_PROB) { + final int[] PLs = new int[(sampleAlleles.size() * (sampleAlleles.size() + 1)) >> 1]; + for (int j = 0; j < PLs.length; j++) { + PLs[j] = rdn.nextInt(1000); + } + genotypeBuilder.PL(PLs); + } + final int[] AD = new int[sampleAlleles.size()]; + for (int j = 0; j < AD.length; j++) { + AD[j] = rdn.nextInt(100); + } + genotypeBuilder.AD(AD); + + variantContexts.add(new VariantContextBuilder() + .loc("chr1", 10000, 10000 + refAllele.getBases().length - 1) + .alleles(sampleAlleles) + .genotypes(genotypeBuilder.make()) + .make()); + } + final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(variantContexts, + genomeLocParser.createGenomeLoc("chr1", 10000), refAllele.getBases()[0], false, true, null); + Assert.assertNotNull(result); + Assert.assertEquals(result.getGenotypes().size(), SAMPLE_COUNT); + Assert.assertTrue(result.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED, + "Not necessarily a bug, need to fix the random data generation to make sure there is enough alt-alleles to go beyond this threshold"); + System.err.println("Alleles: " + result.getNAlleles()); + for (int i = 0; i < SAMPLE_COUNT; i++) { + 
Assert.assertFalse(result.getGenotype(i).hasPL(), "" + i); + } + } @DataProvider(name = "referenceConfidenceMergeData") public Object[][] makeReferenceConfidenceMergeData() { From 9b32cf5291e1e46bb0e29682958fef7204476efb Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Tue, 21 Jun 2016 21:44:27 -0400 Subject: [PATCH 18/68] Fixed merging of GVCF blocks by fixing rounding of GQ values in ReferenceConfidenceModel. --- .../ReferenceConfidenceModel.java | 9 ++-- .../gatk/utils/gvcf/GVCFWriter.java | 22 +--------- .../HaplotypeCallerGVCFIntegrationTest.java | 44 ++++++++++++------- .../variant/GATKVariantContextUtils.java | 26 +++++++++++ .../GATKVariantContextUtilsUnitTest.java | 29 ++++++++++++ 5 files changed, 89 insertions(+), 41 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java index 397179f35..9da43df3f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -63,7 +63,9 @@ import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.genotyper.*; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; import 
org.broadinstitute.gatk.utils.pileup.PileupElement; @@ -250,8 +252,9 @@ public class ReferenceConfidenceModel { // as our GLs for the site. final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs); - gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF))); - gb.PL(leastConfidenceGLs.getAsPLs()); + final int[] leastConfidenceGLsAsPLs = leastConfidenceGLs.getAsPLs(); + gb.GQ(GATKVariantContextUtils.calculateGQFromPLs(leastConfidenceGLsAsPLs)); + gb.PL(leastConfidenceGLsAsPLs); //gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); vcb.genotypes(gb.make()); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java index 18379f990..f0e4ca9eb 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java @@ -246,7 +246,7 @@ public class GVCFWriter implements VariantContextWriter { final int[] minPLs = block.getMinPLs(); gb.PL(minPLs); - final int gq = genotypeQualityFromPLs(minPLs); + final int gq = GATKVariantContextUtils.calculateGQFromPLs(minPLs); gb.GQ(gq); gb.DP(block.getMedianDP()); gb.attribute(GATKVCFConstants.MIN_DP_FORMAT_KEY, block.getMinDP()); @@ -257,26 +257,6 @@ public class GVCFWriter implements VariantContextWriter { return vcb.genotypes(gb.make()).make(); } - - private int genotypeQualityFromPLs(final int[] minPLs) { - int first = minPLs[0]; - int second = minPLs[1]; - if (first > second) { - second = first; - first = minPLs[1]; - } - for (int i = 3; i < minPLs.length; i++) { - final int candidate = minPLs[i]; - if (candidate >= second) continue; - if (candidate <= first) { - second = first; - first = candidate; - } else - second = candidate; - } - return second - first; - } - /** * Helper function to create a 
new HomRefBlock from a variant context and current genotype * diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index fd15a2834..bb8373b20 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -86,7 +86,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { //TODO the following test is commented out for the record //tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8d30370465d74fd549d76dd31adc4c0c"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "119a30fac57a0e5cf1b8164c1059b22c"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "cf5545094ebb264fa8eb879fd848d9ef"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a6bbc30b82e7864baf64163d55f5aee5"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2c67bdc08c8784f2114c2039270b9766"}); @@ -105,8 +105,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, 
"3ae2c7e570855f6d6ca58ddd1089a970"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "822856b75c792be81693019bee672c09"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "6ef5ce3fbc943f15c077a0f12ff5bc2e"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "8bb824886fb0e77d0e8317d69f9d1b62"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "ca87b62a070801e4954d72169b88fb9c"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"}); @@ -128,10 +128,10 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "2f1534d30b51fd8a7861d73091be2336"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "2307bcb9f9e3468375a389720036b7da"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "450906ce3c11860c25b90cf0a56bb1a0"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "3c0346d41a7e57b45b85a920cc04f51f"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "a0be095ed902a8acdb80fb56ca4e8fb4"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, 
"50e628de2a79cd6887af020b713ca3b8"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "8123d8b68b6fa77ef084f292e191622a"}); return tests.toArray(new Object[][]{}); @@ -146,11 +146,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7dc7cfd463ecb7ac535c6ba925c46ef0"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "689d4b9cdc21be370c82251e1f7a3c4f"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "0bc1ca3bff07381a344685b048e76ee4"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9d1724150feccb0a09b6fad522605bb1"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "228e1d2ec2e729a5f79c37f3f2557708"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "2fc7020457dde4439b4133c098d9ab9b"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "8a094080fb25bbcd39325dcdd62bcf65"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "f35192d245babba9764128abad669019"}); return tests.toArray(new Object[][]{}); } @@ -293,7 +293,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testNoCallGVCFMissingPLsBugFix() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header 
-variant_index_type %s -variant_index_parameter %d", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("d55ccf214fd5095e6d586c1547cb1a7a")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9fc3c68f46e747b730615c0be98cb013")); spec.disableShadowBCF(); executeTest("testNoCallGVCFMissingPLsBugFix", spec); } @@ -326,7 +326,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("1733d15e960ed473f58a2bfc7f686a2e")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("d986868d83057c0ecdf7ba177b8282f3")); spec.disableShadowBCF(); executeTest(" testAlleleSpecificAnnotations", spec); } @@ -335,7 +335,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testASMQMateRankSumAnnotation() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, 
privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("e6e09a82cade24f8121c81c1d43b5d03")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("87723bd4442c7ec25f65a77d6434957a")); spec.disableShadowBCF(); executeTest(" testASMQMateRankSumAnnotation", spec); } @@ -344,7 +344,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testASInsertSizeRankSum() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("33db0c7e64fc963c160f8bb59d983375")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("a63d6912b2f2fab7debee9488fbbd0b0")); spec.disableShadowBCF(); executeTest(" testASInsertSizeRankSum", spec); } @@ -362,7 +362,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testHaplotypeCallerMaxNumPLValues() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", 
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("1176028faca6cd397f581f9e60c474a8")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("446604d4398d4c1bad41b9506624ab91")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValues", spec); } @@ -379,7 +379,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("19f5398e4013c06b52c0085fe0b3469e")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a01abc7e0b4a486125967d3a1ebcc33f")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel", spec); // Make sure the "Maximum allowed number of PLs exceeded" messages are in the log @@ -404,7 +404,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName); - final WalkerTestSpec spec = new 
WalkerTestSpec(commandLine + " -o %s", Arrays.asList("19f5398e4013c06b52c0085fe0b3469e")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a01abc7e0b4a486125967d3a1ebcc33f")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel", spec); // Make sure the "Maximum allowed number of PLs exceeded" messages are in the log @@ -414,4 +414,14 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // Set the log level back logger.setLevel(level); } + + //Regression test for https://github.com/broadinstitute/gsa-unstable/issues/1345 + @Test + public void testHaplotypeCallerGVCFBlocks() { + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:1-1000000 -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "gvcf_blocks_test.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("802c53621bd2004d9052a8e81d91df3e")); + spec.disableShadowBCF(); + executeTest("testHaplotypeCallerGVCFBlocks", spec); + } } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index 407a49570..d67dd6c01 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -2314,5 +2314,31 @@ public class GATKVariantContextUtils { builder.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFrequency.toArray()); } } + + /** + * @param plValues array of PL values + * @return the genotype quality corresponding to the PL values + */ + public static int calculateGQFromPLs(final int[] plValues) { 
+ if ( plValues == null ) throw new IllegalArgumentException("Array of PL values cannot be null."); + if ( plValues.length < 2 ) throw new IllegalArgumentException("Array of PL values must contain at least two elements."); + + int first = plValues[0]; + int second = plValues[1]; + if (first > second) { + second = first; + first = plValues[1]; + } + for (int i = 2; i < plValues.length; i++) { + final int candidate = plValues[i]; + if (candidate >= second) continue; + if (candidate <= first) { + second = first; + first = candidate; + } else + second = candidate; + } + return second - first; + } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java index f875d2e8f..20ec7da58 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -1872,5 +1872,34 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final Map calledAltAlleles = new LinkedHashMap<>(); GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, null); } + + @DataProvider(name="gqFromPLsData") + public Object[][] gqFromPLsData() { + return new Object[][]{ + {new int[]{0, 15}, 15}, + {new int[]{15, 0}, 15}, + {new int[]{0, 10, 20}, 10}, + {new int[]{20, 10, 0}, 10}, + {new int[]{0, 10, 20, 30, 40}, 10}, + {new int[]{30, 40, 20, 10, 0}, 10}, + {new int[]{-10, 20, 35}, 30}, + {new int[]{35, 40, -10, 15, 20}, 25}, + {new int[]{0, 10, 20, 30, 40, 50, 5}, 5}, + {new int[]{15, 15, 0, 5}, 5}, + {new int[]{15, 15, 0, 25}, 15}, + {new int[]{0, 15, 0, 25}, 0} + }; + } + + @Test(dataProvider = "gqFromPLsData") + public void testCalculateGQFromPLs(final int[] plValues, final int expectedGQ) { + 
Assert.assertEquals(GATKVariantContextUtils.calculateGQFromPLs(plValues), expectedGQ); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testCalculateGQFromShortPLArray() { + final int[] plValues = new int[]{0}; + GATKVariantContextUtils.calculateGQFromPLs(plValues); + } } From d6d0678b50a6a4a36a6be5c22ed8c8926c7e4a2d Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Wed, 15 Jun 2016 15:32:17 -0400 Subject: [PATCH 19/68] Build on Laura's code and finish porting MuTect1 clustered read position filter. --- .../walkers/cancer/ClusteredReadPosition.java | 350 ++++++++++++++++++ ...MuTectStats.java => MedianStatistics.java} | 71 ++-- .../cancer/m2/ClusteredEventsAnnotator.java | 191 ---------- .../cancer/m2/M2ArgumentCollection.java | 3 + .../gatk/tools/walkers/cancer/m2/MuTect2.java | 28 +- .../cancer/m2/SomaticGenotypingEngine.java | 29 +- .../cancer/m2/MuTect2IntegrationTest.java | 33 +- .../gatk/utils/sam/ReadUtils.java | 4 +- .../gatk/utils/variant/GATKVCFConstants.java | 6 + .../utils/variant/GATKVCFHeaderLines.java | 10 +- .../broadinstitute/gatk/utils/BaseTest.java | 1 + 11 files changed, 456 insertions(+), 270 deletions(-) create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java rename protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/{m2/MuTectStats.java => MedianStatistics.java} (85%) delete mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java new file mode 100644 index 000000000..01fb11c69 --- /dev/null +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/ClusteredReadPosition.java @@ -0,0 +1,350 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE"). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2016 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.cancer; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.math3.stat.descriptive.rank.Median; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.gatk.tools.walkers.cancer.m2.MuTect2; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import 
org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * Detect clustering of variants near the ends of reads + * + *

This annotation detects clustering of evidence for a somatic variant near the ends of reads. To turn on the annotation and the accompanying filter (clustered_read_position), add the --enable_clustered_read_position_filter flag to the command line. + * + * + *

Statistical notes

+ *

ClusteredReadPosition produces four INFO field annotations. At a given somatic variant site, MEDIAN_LEFT_OFFSET is the median of the number of bases from the left end of the tumor read to the variant. MEDIAN_RIGHT_OFFSET is similar, but counts from the right end of the read. MAD_LEFT_OFFSET and MAD_RIGHT_OFFSET measure the median absolute deviations. The median gives us the offset of a representative read, while the median absolute deviation captures the spread. We filter a variant if MEDIAN_LEFT_OFFSET <= 10 and MAD_LEFT_OFFSET <= 3, or if MEDIAN_RIGHT_OFFSET <= 10 and MAD_RIGHT_OFFSET <= 3. + * + * + *

Caveat

+ *

ClusteredReadPosition is available with MuTect2 only

+ * + *

Related annotation

+ *
  • ReadPosRankSum is a similar annotation designed for germline variants. + * + */ +public class ClusteredReadPosition extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { + private final static Logger logger = Logger.getLogger(ClusteredReadPosition.class); + private String tumorSampleName = null; + + @Override + public List getKeyNames() { return Arrays.asList( + GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY, + GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY, + GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY, + GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY); + } + + @Override + public List getDescriptions() { + List descriptions = new ArrayList<>(); + for (final String infoFieldKey : getKeyNames()){ + descriptions.add(GATKVCFHeaderLines.getInfoLine(infoFieldKey)); + } + return descriptions; + + // the following causes a cryptic class not found error, similar to the one in computeReadPositionStats + // return getKeyNames().stream().map(GATKVCFHeaderLines::getInfoLine).collect(Collectors.toList()); + } + + @Override + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + // TODO: might make sense to move this code to SomaticGenoypingEngine. + // FIXME: checking walker is mutect2 is not ideal...moving this annotation to SomaticGenoypingEngine will solve it + + // populate tumorSampleName the first time we call this method. skip afterwards. 
+ if (tumorSampleName == null){ + if (walker instanceof MuTect2) { + tumorSampleName = ((MuTect2) walker).getTumorSampleName(); + } else { + throw new IllegalStateException("ClusteredReadPosition: walker is not MuTect2"); + } + } + + // we skip multi-allelic sites + if (vc.getAlternateAlleles().size() > 1){ + return null; + } + + final Map result = new HashMap<>(); + + if ( stratifiedPerReadAlleleLikelihoodMap != null ) { + final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(tumorSampleName); + if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) { + final Optional readPositionStatsOption = computeReadPositionStats(vc, likelihoodMap); + if (readPositionStatsOption.isPresent()){ + MedianStatistics readPositionStats = readPositionStatsOption.get(); + result.put(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY, readPositionStats.getLeftMedian()); + result.put(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY, readPositionStats.getRightMedian()); + result.put(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY, readPositionStats.getLeftMAD()); + result.put(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY, readPositionStats.getRightMAD()); + } else { + return null; + } + } + } + + return result; + } + + /** + * + * @param vc + * @param pralm + * @return median of left and right offsets and their median absolute deviations. does not return null. + */ + private Optional computeReadPositionStats(final VariantContext vc, + final PerReadAlleleLikelihoodMap pralm) { + final int variantStartPosition = vc.getStart(); + final List tumorLeftOffsets = new ArrayList<>(); + final List tumorRightOffsets = new ArrayList<>(); + for ( final Map.Entry> readAlleleLikelihood : pralm.getLikelihoodReadMap().entrySet() ) { + final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(readAlleleLikelihood.getValue()); + final GATKSAMRecord read = readAlleleLikelihood.getKey(); + if ( mostLikelyAllele.getMostLikelyAllele().isReference() || ! 
mostLikelyAllele.isInformative() || ! isUsableRead(read)) { + continue; + } + + final Pair offsetPair = getVariantPositionInRead(read, variantStartPosition); + final OptionalInt variantPositionInReadFromLeft = offsetPair.getFirst(); + final OptionalInt variantPositionInReadFromRight = offsetPair.getSecond(); + + // suffices to check only the left offset because the right offset depends on it + if ( variantPositionInReadFromLeft.isPresent() ) { + tumorLeftOffsets.add(variantPositionInReadFromLeft.getAsInt()); + tumorRightOffsets.add(variantPositionInReadFromRight.getAsInt()); + } + } + + if (tumorLeftOffsets.isEmpty() || tumorRightOffsets.isEmpty()) { + // This condition seems to arise when the reads as aligned in the bam (as represented by PRALM) do not contain the alt read found by HaplotypeCaller + logger.warn("At Position " + vc.getContig() + ": " + vc.getStart() + " , the left or right offset list is empty"); + return Optional.empty(); + } + + // The following (mapToDouble() in particular) causes ClusteredReadPosition to be not added to ClassMap + // leftMedian = median.evaluate(tumorLeftOffsets.stream().mapToDouble( x -> x ).toArray()); + // rightMedian = median.evaluate(tumorRightOffsets.stream().mapToDouble( x -> x).toArray()); + + // until we understand why mapToDouble() causes the above error, have to compute medians in two steps + // first use a for loop to manually cast integer to doubles, then call median :: evaluate + double[] tumorLeftOffsetsDouble = new double[tumorLeftOffsets.size()]; + double[] tumorRightOffsetsDouble = new double[tumorRightOffsets.size()]; + for (int i = 0; i < tumorLeftOffsets.size(); i++){ + tumorLeftOffsetsDouble[i] = (double) tumorLeftOffsets.get(i); + tumorRightOffsetsDouble[i] = (double) tumorRightOffsets.get(i); + } + + Median median = new Median(); + double leftMedian = median.evaluate(tumorLeftOffsetsDouble); + double rightMedian = median.evaluate(tumorRightOffsetsDouble); + double leftMAD = calculateMAD(tumorLeftOffsets, 
leftMedian); + double rightMAD = calculateMAD(tumorRightOffsets, rightMedian); + + return( Optional.of(new MedianStatistics(leftMedian, rightMedian, leftMAD, rightMAD) ) ); + } + + private static class MedianStatistics { + private double leftMedian; + private double rightMedian; + private double leftMAD; + private double rightMAD; + + public MedianStatistics(double leftMedian, double rightMedian, double leftMAD, double rightMAD) { + this.leftMedian = leftMedian; + this.rightMedian = rightMedian; + this.leftMAD = leftMAD; + this.rightMAD = rightMAD; + } + + public double getLeftMedian() { + return leftMedian; + } + + public double getRightMedian() { + return rightMedian; + } + + public double getLeftMAD() { + return leftMAD; + } + + public double getRightMAD() { + return rightMAD; + } + } + + + /** + Examples below show how we compute the position of the variant with respect to the left and right ends of the reads. + Note that a variant may be a SNP, deletion, or insertion, and we are counting the number of bases from the left/right end of the read to that variant. + We first compute the left offset. Then, right offset = read length - left offset - 1. + This means that if there is an insertion between either end of a read and the variant, we count the inserted bases. Conversely, we do not count the deleted bases between the end of a read and a variant. + We count soft-clipped bases. + + example 1 : SNP + + right offset: 9 8 7 6 5 4 3 2 1 0 + ref: _ _ _ _ _ _ _ _ _ _ + read: _ _ _ _ x _ _ _ _ _ + left offset: 0 1 2 3 4 5 6 7 8 9 + + left-offset = 4. right offset = 5. + read.getReadLength() = 10. numReadBasesToVariant = 5. + + example 2: deletion + + We count from the left end of the read to the last non-deleted base i.e. the first deleted base is not counted. + From the right end, we count bases to the *end* of the deletion. 
+ + right offset: 9 8 7 6 5 4 3 2 1 0 + ref: _ _ _ _ _ _ _ _ _ _ + read: _ _ _ _|- - - -|_ _ + left offset: 0 1 2 3 4 5 6 7 8 9 + + left-offset = 3. right-offset = 2. + read.getReadLength() = 6. numReadBasesToVariant = 4 + + example 3: insertion + + For insertions, we count from the left to the first inserted base. From the right, we count all the way to the first inserted base. + In the future, we may modify this; it might be desirable to count from the right to the *last* inserted base. + + right offset: 9 8 7 6 5 4 3 2 1 0 + ref: _ _ _ _ _ _ _ _ + read: _ _ _ I I I _ _ _ _ + left offset: 0 1 2 3 4 5 6 7 8 9 + + left-offset = 3. right offset = 6 + read.getReadLength() = 10. numReadBasesToVariant = 4. + + */ + + /** + * The function assumes that read contains the variant allele. + * + * @param read + * @param variantStartPosition the location of the variant in the reference + * @return + */ + + protected Pair getVariantPositionInRead(final GATKSAMRecord read, final int variantStartPosition) { + final Pair refPositionAndDeletionFlag = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), variantStartPosition, true); + // the +1 is needed there because getReadCoordinateForReferenceCoordinate() returns the number of read bases from the left end to the variant - 1 + int numReadBasesFromLeftEndToVariant = refPositionAndDeletionFlag.getFirst() + 1; + + // we don't take advantage of fallsInsideOrJustBeforeDeletionOrSkippedRegion flag now, but we might want to, so I will leave it here in comments. 
+ // boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = refPositionAndDeletionFlag.getSecond(); + + if ( refPositionAndDeletionFlag.getFirst() == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + return new Pair(OptionalInt.empty(), OptionalInt.empty()); + } else { + int leftOffset = numReadBasesFromLeftEndToVariant - 1; + int rightOffset = read.getReadLength() - numReadBasesFromLeftEndToVariant; + return new Pair(OptionalInt.of(leftOffset), OptionalInt.of(rightOffset)); + } + } + + /** + * Can the read be used in comparative tests between ref / alt bases? + * + * @param read the read to consider + * @return false if MQ is either 0 or unavailable. true otherwise. + */ + private boolean isUsableRead(final GATKSAMRecord read) { + return( read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE); + } + + /** + * + * @param offsets a list of integers + * @param median median of the list offsets. + * @return median absolute deviation (median of the list of deviations from the median) + */ + private double calculateMAD(final List offsets, final double median) { + // This code is concise but somehow leads to ClusteredReadPosition class being removed from ClassMap. 
+ // mapToDouble() seems to be the trigger + // return new Median().evaluate(offsets.stream().mapToDouble(x -> Math.abs(x - median)).toArray()); + + double[] medianAbsoluteDeviations = new double[offsets.size()]; + for (int i = 0; i < offsets.size(); i++){ + medianAbsoluteDeviations[i] = Math.abs(offsets.get(i) - median); + } + + return new Median().evaluate(medianAbsoluteDeviations); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTectStats.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/MedianStatistics.java similarity index 85% rename from protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTectStats.java rename to protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/MedianStatistics.java index bb02db4ad..0a88c28de 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTectStats.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/MedianStatistics.java @@ -49,60 +49,37 @@ * 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.gatk.tools.walkers.cancer.m2; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +package org.broadinstitute.gatk.tools.walkers.cancer; /** - * Collection of Statistical methods and tests used by MuTect + * Created by tsato on 6/27/16. 
*/ -public class MuTectStats { - - public static double calculateMAD(ArrayList xs, double median) { - ArrayList deviations = new ArrayList<>(xs.size()); - - for(double x : xs) { - deviations.add(Math.abs(x - median)); - } - - return getMedian(deviations); +public class MedianStatistics { + private double leftMedian; + private double rightMedian; + private double leftMAD; + private double rightMAD; + public MedianStatistics(double leftMedian, double rightMedian, double leftMAD, double rightMAD) { + this.leftMedian = leftMedian; + this.rightMedian = rightMedian; + this.leftMAD = leftMAD; + this.rightMAD = rightMAD; } - public static double getMedian(ArrayList data) { - Collections.sort(data); - Double result; - - if (data.size() % 2 == 1) { - // If the number of entries in the list is not even. - - // Get the middle value. - // You must floor the result of the division to drop the - // remainder. - result = data.get((int) Math.floor(data.size()/2) ); - - } else { - // If the number of entries in the list are even. - - // Get the middle two values and average them. 
- Double lowerMiddle = data.get(data.size()/2 ); - Double upperMiddle = data.get(data.size()/2 - 1 ); - result = (lowerMiddle + upperMiddle) / 2; - } - - return result; + public double getLeftMedian() { + return leftMedian; } - public static double[] convertIntegersToDoubles(List integers) - { - double[] ret = new double[integers.size()]; - for (int i=0; i < ret.length; i++) - { - ret[i] = integers.get(i); - } - return ret; + public double getRightMedian() { + return rightMedian; + } + + public double getLeftMAD() { + return leftMAD; + } + + public double getRightMAD() { + return rightMAD; } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java deleted file mode 100644 index db47d34e8..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/ClusteredEventsAnnotator.java +++ /dev/null @@ -1,191 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE"). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2016 Broad Institute, Inc. 
-* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.tools.walkers.cancer.m2; - -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.sam.AlignmentUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -import java.util.*; - -/** - * Created by gauthier on 7/27/15. 
- */ -public class ClusteredEventsAnnotator extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { - - private String tumorSampleName = null; - - @Override - public List getKeyNames() { return Arrays.asList("tumorForwardOffsetMedian","tumorReverseOffsetMedian","tumorForwardOffsetMAD","tumorReverseOffsetMAD"); } - - @Override - public List getDescriptions() { - //TODO: this needs a lot of re-phrasing - return Arrays.asList(new VCFInfoHeaderLine("TUMOR_FWD_POS_MEDIAN", 1, VCFHeaderLineType.Integer, "Median offset of tumor variant position from positive read end"), - new VCFInfoHeaderLine("TUMOR_FWD_POS_MAD", 1, VCFHeaderLineType.Integer, "Median absolute deviation from the median for tumor forward read positions"), - new VCFInfoHeaderLine("TUMOR_REV_POS_MEDIAN", 1, VCFHeaderLineType.Integer, "Median offset of tumor variant position from negative read end"), - new VCFInfoHeaderLine("TUMOR_REV_POS_MAD", 1, VCFHeaderLineType.Integer, "Median absolute deviation from the median for tumor reverse read positions")); - } - - @Override - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { - - if (tumorSampleName == null){ - if (walker instanceof MuTect2 ) { - tumorSampleName = ((MuTect2) walker).tumorSampleName; - } else { - // ts: log error and exit - throw new IllegalStateException("ClusteredEventsAnnotator: walker is not MuTect2"); - } - } - - final Map map = new HashMap<>(); - - - if ( stratifiedPerReadAlleleLikelihoodMap != null ) { - final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(tumorSampleName); - MuTect2.logReadInfo("HAVCYADXX150109:2:2209:19034:53394", likelihoodMap.getLikelihoodReadMap().keySet(), "Present inside ClusteredEventsAnnotator:annotate"); - if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) { - double[] list = 
fillQualsFromLikelihoodMap(vc.getStart(), likelihoodMap); // [fwdMedian, revMedian, fwdMAD, revMAD] - final int FWDMEDIAN = 0, REVMEDIAN = 1, FWDMAD = 2, REVMAD = 3; // ts: make a class to contain these values - map.put("TUMOR_FWD_POS_MEDIAN", list[FWDMEDIAN]); - map.put("TUMOR_REV_POS_MEDIAN", list[REVMEDIAN]); - map.put("TUMOR_FWD_POS_MAD", list[FWDMAD]); - map.put("TUMOR_REV_POS_MAD", list[REVMAD]); - } - } - - return map; - - } - - private double[] fillQualsFromLikelihoodMap(final int refLoc, - final PerReadAlleleLikelihoodMap likelihoodMap) { - final ArrayList tumorFwdOffset = new ArrayList<>(); - final ArrayList tumorRevOffset = new ArrayList<>(); - for ( final Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) { - final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if ( ! a.isInformative() ) - continue; // read is non-informative - - final GATKSAMRecord read = el.getKey(); - if ( isUsableRead(read, refLoc) ) { - if ( a.getMostLikelyAllele().isReference() ) - continue; - final Double valueRight = getElementForRead(read, refLoc, ReadUtils.ClippingTail.RIGHT_TAIL); - if ( valueRight == null ) - continue; - tumorFwdOffset.add(valueRight); - final Double valueLeft = getElementForRead(read, refLoc, ReadUtils.ClippingTail.LEFT_TAIL); - if ( valueLeft == null ) - continue; - tumorRevOffset.add(valueLeft); - } - } - - double fwdMedian = 0.0; - double revMedian = 0.0; - double fwdMAD = 0.0; - double revMAD = 0.0; - - if (!tumorFwdOffset.isEmpty() && !tumorRevOffset.isEmpty()) { - fwdMedian = MuTectStats.getMedian(tumorFwdOffset); - revMedian = MuTectStats.getMedian(tumorRevOffset); - fwdMAD = MuTectStats.calculateMAD(tumorFwdOffset, fwdMedian); - revMAD = MuTectStats.calculateMAD(tumorRevOffset, revMedian); - } - - return( new double[] {fwdMedian, revMedian, fwdMAD, revMAD} ); // TODO: make an object container instead of array - } - - protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, 
final ReadUtils.ClippingTail tail) { - final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refLoc, tail, true); - if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) // offset is the number of bases in the read, including inserted bases, from start of read to the variant - return null; - - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0); // readpos is the number of REF bases from start to variant. I would name it as such... - final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); - if (readPos > numAlignedBases / 2) - readPos = numAlignedBases - (readPos + 1); - return (double)readPos; - } - - /** - * Can the read be used in comparative tests between ref / alt bases? - * - * @param read the read to consider - * @param refLoc the reference location - * @return true if this read is meaningful for comparison, false otherwise - */ - protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) { - return !( read.getMappingQuality() == 0 || - read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java index 339603567..8ad89a848 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java @@ -138,6 +138,9 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection @Argument(fullName = "enable_strand_artifact_filter", required = false, doc = "turn on strand artifact filter") public boolean ENABLE_STRAND_ARTIFACT_FILTER = false; + 
@Argument(fullName = "enable_clustered_read_position_filter", required = false, doc = "turn on clustered read position filter") + public boolean ENABLE_CLUSTERED_READ_POSITION_FILTER = false; + /** * This argument is used for the M1-style read position filter */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java index 08ea5c243..b3c7d950f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java @@ -205,8 +205,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i private byte MIN_TAIL_QUALITY; private double log10GlobalReadMismappingRate; - - @ArgumentCollection protected M2ArgumentCollection MTAC = new M2ArgumentCollection(); @@ -364,6 +362,9 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i private VariantAnnotatorEngine initializeVCFOutput() { // initialize the output VCF header + if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER) { + annotationsToUse.add("ClusteredReadPosition"); + } final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); Set headerInfo = new HashSet<>(); @@ -418,6 +419,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME)); headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME)); headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME)); + headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.CLUSTERED_READ_POSITION_FILTER_NAME)); if ( ! 
doNotRunPhysicalPhasing ) { @@ -835,10 +837,21 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME); } - // TODO: Add clustered read position filter here - // TODO: Move strand bias filter here - return filters; + // clustered read position filter + if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){ + Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY); + Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY); + Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY); + Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY); + //If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter + if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) || + (tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorRevPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD)) + filters.add(GATKVCFConstants.CLUSTERED_READ_POSITION_FILTER_NAME); + } + // TODO: Move strand bias filter here + + return filters; } @@ -1313,6 +1326,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample()); } + + public String getTumorSampleName(){ + return tumorSampleName; + } + // KCIBUL: new stuff -- read up on this!! /** * As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior. 
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java index 536283bc3..dbacfbe41 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java @@ -209,7 +209,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { // TODO: CONFIRM WITH GSA IF IT IS OK TO REMOVE READS FROM THE PRALM (should be... they do it in filterPoorlyModeledReads!) PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName)); filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false); - MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present after filtering for overlapping reads"); + MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads"); // extend to multiple samples // compute tumor LOD for each alternate allele @@ -249,12 +249,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { if (hasNormal) { normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true); - MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after filtering for overlapping reads"); + MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads"); GenomeLoc eventGenomeLoc = 
genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); Collection cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc); Collection dbsnpVC = tracker.getValues(dbsnpRod, eventGenomeLoc); - // remove the effect of cosmic from dbSNP final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty()); @@ -320,6 +319,10 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap(); splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM); + MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split"); + MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split"); + MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split"); + // TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[] // TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd. 
final PerAlleleCollection alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true); @@ -328,6 +331,22 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final PerAlleleCollection alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true); final PerAlleleCollection tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse); + if( configuration.DEBUG && logger != null ) { + StringBuilder forwardMessage = new StringBuilder("Calculated forward allelic fraction at " + loc + " = ["); + StringBuilder reverseMessage = new StringBuilder("Calculated reverse allelic fraction at " + loc + " = ["); + + for (Allele altAllele : altAlleleFractions.getAltAlleles()){ + forwardMessage.append( altAllele + ": " + alleleFractionsForward.getAlt(altAllele) + ", "); + reverseMessage.append( altAllele + ": " + alleleFractionsReverse.getAlt(altAllele) + ", "); + } + + forwardMessage.append("]"); + reverseMessage.append("]"); + + logger.info(forwardMessage.toString()); + logger.info(reverseMessage.toString()); + } + double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef(); double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef(); @@ -500,6 +519,10 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { for ( final Allele altAllele : vc.getAlternateAlleles() ) { int altCount = alleleCounts.getAlt(altAllele); double alleleFraction = (double) altCount / (refCount + altCount); + // weird case, but I've seen it happen in one strand cases + if (refCount == 0 && altCount == refCount ) { + alleleFraction = 0; + } alleleFractions.setAlt(altAllele, alleleFraction); // logger.info("Counted " + refCount + " ref and " + altCount + " alt " ); } diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java index 70b401333..0b93b9fc1 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java @@ -57,8 +57,6 @@ import org.testng.annotations.Test; import java.util.*; public class MuTect2IntegrationTest extends WalkerTest { - final static String REF = hg19Reference; - final static String CCLE_MICRO_TUMOR_BAM = privateTestDir + "HCC1143.cghub.ccle.micro.bam"; final static String CCLE_MICRO_NORMAL_BAM = privateTestDir + "HCC1143_BL.cghub.ccle.micro.bam"; final static String CCLE_MICRO_INTERVALS_FILE = privateTestDir + "HCC1143.cghub.ccle.micro.intervals"; @@ -72,8 +70,6 @@ public class MuTect2IntegrationTest extends WalkerTest { final static String DREAM3_TP_INTERVALS_FILE = privateTestDir + "m2_dream3.tp.intervals"; final static String DREAM3_FP_INTERVALS_FILE = privateTestDir + "m2_dream3.fp.intervals"; - final static String MULTIALLELIC_TUMOR_BAM = privateTestDir + "m2-multiallelic-tumor.bam"; - final String commandLine = @@ -82,7 +78,7 @@ public class MuTect2IntegrationTest extends WalkerTest { private void M2Test(String tumorBam, String normalBam, String intervals, String args, String md5) { final String base = String.format( commandLine, - REF, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) + + hg19Reference, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) + " -o %s "; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); @@ -97,7 +93,7 @@ public class MuTect2IntegrationTest extends WalkerTest { private void m2TumorOnlyTest(String tumorBam, String intervals, String args, String md5) { 
final String base = String.format( "-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -L %s", - REF, DBSNP, COSMIC, PON, tumorBam, intervals) + + hg19Reference, DBSNP, COSMIC, PON, tumorBam, intervals) + " -o %s "; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); @@ -109,7 +105,7 @@ public class MuTect2IntegrationTest extends WalkerTest { private void M2TestWithDroppedReads(String tumorBam, String normalBam, String intervals, String args, String md5Variants, String md5Bamout) { final String base = String.format( commandLine, - REF, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) + + hg19Reference, DBSNP, COSMIC, PON, tumorBam, normalBam, intervals) + " -o %s " + "-bamout %s --emitDroppedReads"; @@ -124,7 +120,7 @@ public class MuTect2IntegrationTest extends WalkerTest { @Test public void testMicroRegression() { - M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "", "dc6d742e85a59b237f5541109a6d343e"); + M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "", "dd3bb9526c85c0aed39545c4639ff138"); } /** @@ -134,7 +130,7 @@ public class MuTect2IntegrationTest extends WalkerTest { */ @Test public void testTruePositivesDream3() { - M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_TP_INTERVALS_FILE, "", "7faeb329798cca63a42867404111847c"); + M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_TP_INTERVALS_FILE, "", "5bd540d238916a2b91e827aed3592e59"); } /** @@ -143,7 +139,7 @@ public class MuTect2IntegrationTest extends WalkerTest { @Test public void testTruePositivesDream3TrackedDropped() { M2TestWithDroppedReads(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, "21:10935369", "", - "a2e6cc12a21219d510b6719ee86c676e", + "48a446d47bb10434cb7f0ee726d15721", "b536e76870326b4be01b8d6b83c1cf1c"); } @@ -153,7 +149,7 @@ public class MuTect2IntegrationTest extends 
WalkerTest { */ @Test public void testFalsePositivesDream3() { - M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "", "fe3adcf8ac45e8ec9a9feb26908f67a9"); // e2413f4166b6ed20be6cdee6616ba43d + M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "", "c23f794866797f9bbcb3ed04451758be"); // e2413f4166b6ed20be6cdee6616ba43d } /** @@ -161,7 +157,7 @@ public class MuTect2IntegrationTest extends WalkerTest { */ @Test public void testContaminationCorrection() { - M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "4ffcef4c72ac72b9b8738efdcf3e04e9"); + M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", "c25e48edd704bbb436cd6456d9f47d8b"); } /** @@ -169,19 +165,18 @@ public class MuTect2IntegrationTest extends WalkerTest { */ @Test public void testTumorOnly(){ - m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "6044780242414820090c5b4b1d4b8ac0"); + m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "2af2253b1f09ea8fd354e1bf2c4612f0"); } @Test public void testStrandArtifactFilter(){ - M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_strand_artifact_filter", "b988ba4b5f3af4674e28b3501bd3b124"); + M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_strand_artifact_filter", "75c9349ff9f8dc84291396ac50871f64"); } -// @Test -// public void testMultiAllelicSite(){ -// // TODO need b38 reference -// m2TumorOnlyTest(MULTIALLELIC_TUMOR_BAM, "1:23558000-23560000", "", "5c7182623391c1faec3f7c05c0506781") -// } + @Test + public void testClusteredReadPositionFilter() { + M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "c333f7dc11e39e0713147ad9af2bf4db"); + } } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java index 75617e87d..afa486cac 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java @@ -501,7 +501,7 @@ public class ReadUtils { if (refBases + cigarElement.getLength() < goal) shift = cigarElement.getLength(); else - shift = goal - refBases; + shift = goal - refBases; // get to the goal refBases += shift; } @@ -515,7 +515,7 @@ public class ReadUtils { final boolean endsWithinCigar = shift < cigarElement.getLength(); // If it isn't, we need to check the next one. There should *ALWAYS* be a next one - // since we checked if the goal coordinate is within the read length, so this is just a sanity check. + // since we checked if the goal coordinate is within the read length, this is just a sanity check. if (!endsWithinCigar && !cigarElementIterator.hasNext()) { if (allowGoalNotReached) { return new Pair(CLIPPING_GOAL_NOT_REACHED, false); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index 06c626496..1a8381ade 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -138,6 +138,10 @@ public final class GATKVCFConstants { public static final String TLOD_REV_KEY = "TLOD_REV"; public static final String TUMOR_SB_POWER_FWD_KEY = "TUMOR_SB_POWER_FWD"; public static final String TUMOR_SB_POWER_REV_KEY = "TUMOR_SB_POWER_REV"; + public static final String MEDIAN_LEFT_OFFSET_KEY = "MEDIAN_LEFT_OFFSET"; + public static final String MEDIAN_RIGHT_OFFSET_KEY = "MEDIAN_RIGHT_OFFSET"; + public static final String MAD_MEDIAN_LEFT_OFFSET_KEY = "MAD_LEFT_OFFSET"; + public static final 
String MAD_MEDIAN_RIGHT_OFFSET_KEY = "MAD_RIGHT_OFFSET"; //FORMAT keys @@ -179,6 +183,8 @@ public final class GATKVCFConstants { public static final String TUMOR_LOD_FILTER_NAME = "t_lod_fstar"; //M2 public static final String TRIALLELIC_SITE_FILTER_NAME = "triallelic_site"; //M2 public static final String STRAND_ARTIFACT_FILTER_NAME = "strand_artifact"; // M2 + public static final String CLUSTERED_READ_POSITION_FILTER_NAME = "clustered_read_position"; // M2 + // Symbolic alleles public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT"; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java index a64eccf92..0c683cf20 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java @@ -72,8 +72,8 @@ public class GATKVCFHeaderLines { addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TUMOR_LOD_FILTER_NAME, "Tumor does not meet likelihood threshold")); addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME, "Site filtered due to contraction of short tandem repeat region")); addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME, "Site filtered because more than two alt alleles pass tumor LOD")); - addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, "Strand bias detected: evidence for alt allele comes from one read direction only")); - // addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.CLUSTERED_READ_POSITION_FILTER_NAME, "Variant appears in similar read positions")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME, "Evidence for alt allele comes from one read direction only")); + addFilterLine(new 
VCFFilterHeaderLine(GATKVCFConstants.CLUSTERED_READ_POSITION_FILTER_NAME, "Evidence for somatic variant clusters near the ends of reads")); @@ -197,7 +197,7 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(BEAGLE_AF_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Frequency from Comparison ROD at this site")); addInfoLine(new VCFInfoHeaderLine(BEAGLE_AN_COMP_KEY, 1, VCFHeaderLineType.Float, "Allele Number from Comparison ROD at this site")); - // M2-related info lines + // More M2-related info lines addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.String, "Number of events in this haplotype")); addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, 1, VCFHeaderLineType.Integer, "Maximum distance between events in this active region")); addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, 1, VCFHeaderLineType.Integer, "Minimum distance between events in this active region")); @@ -209,6 +209,10 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TLOD_REV_KEY,1,VCFHeaderLineType.Float,"TLOD from reverse reads only")); addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY,1,VCFHeaderLineType.Float,"Strand bias power for forward reads")); addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY,1,VCFHeaderLineType.Float,"Stand bias power for reverse reads")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY, 1, VCFHeaderLineType.Float, "Median of the number of bases between the left end of the tumor read and the variant")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY, 1, VCFHeaderLineType.Float, "Median of the number of bases between the variant and the right end of the tumor read")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY, 1, VCFHeaderLineType.Float, "Median absolute deviation of 
medians of the number of bases between the left end of the tumor read and the variant")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY, 1, VCFHeaderLineType.Float, "Median absolute deviation of medians of the number of bases between the variant and the right end of the tumor read")); } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java index 4d11a43cd..a9ffd21e7 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java @@ -97,6 +97,7 @@ public abstract class BaseTest { public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; + public static final String b38Reference = "/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta"; public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; public static final String hg19ReferenceWithChrPrefixInChromosomeNames = "/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta"; public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; From aace73e884ffb98477b92e5ad2cac1c0c657214e Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 8 Jul 2016 12:46:50 -0400 Subject: [PATCH 20/68] Enable control of reporting periodicity --- .../arguments/GATKArgumentCollection.java | 7 +++++ .../gatk/engine/executive/MicroScheduler.java | 3 +- .../utils/progressmeter/ProgressMeter.java | 13 ++------- .../progressmeter/ProgressMeterDaemon.java | 29 ++++++------------- 4 files changed, 21 insertions(+), 31 deletions(-) diff --git 
a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java index 737a46ba1..6f527b121 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java @@ -52,6 +52,9 @@ public class GATKArgumentCollection { // the default value of the stop of the expanded window public static final int DEFAULT_REFERENCE_WINDOW_STOP = 0; + // the default time in seconds between progress meter calls + public final static long DEFAULT_SECONDS_BETWEEN_PROGRESS_UPDATES = 10; + /** the constructor */ public GATKArgumentCollection() { } @@ -354,6 +357,10 @@ public class GATKArgumentCollection { @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false) public double globalQScorePrior = -1.0; + @Advanced + @Argument(fullName="secondsBetweenProgressUpdates", shortName = "secondsBetweenProgressUpdates", doc = "Time interval for process meter information output (in seconds)", required=false) + public long secondsBetweenProgressUpdates = DEFAULT_SECONDS_BETWEEN_PROGRESS_UPDATES; + // -------------------------------------------------------------------------------------------------------------- // diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java index e40edb760..5da39245e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java @@ -204,7 +204,8 @@ public abstract class MicroScheduler implements 
MicroSchedulerMBean { // Create the progress meter, and register it with the analysis engine engine.registerProgressMeter(new ProgressMeter(progressLogFile, availableTraversalEngines.peek().getTraversalUnits(), - engine.getRegionsOfGenomeBeingProcessed())); + engine.getRegionsOfGenomeBeingProcessed(), + engine.getArguments().secondsBetweenProgressUpdates)); // Now that we have a progress meter, go through and initialize the traversal engines for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java index b68a5b9e0..b2fdbb8bc 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java @@ -170,22 +170,15 @@ public class ProgressMeter { /** * Create a new ProgressMeter * - * Note that progress meter isn't started until the client calls start() - * * @param performanceLogFile an optional performance log file where a table of performance logs will be written * @param processingUnitName the name of the unit type being processed, suitable for saying X seconds per processingUnitName * @param processingIntervals the intervals being processed + * @param secondsBetweenProgressUpdates how frequently (in seconds) to print progress */ public ProgressMeter(final File performanceLogFile, - final String processingUnitName, - final GenomeLocSortedSet processingIntervals) { - this(performanceLogFile, processingUnitName, processingIntervals, ProgressMeterDaemon.DEFAULT_POLL_FREQUENCY_MILLISECONDS); - } - - protected ProgressMeter(final File performanceLogFile, final String processingUnitName, final GenomeLocSortedSet processingIntervals, - final long pollingFrequency) { + final long secondsBetweenProgressUpdates) { if ( 
processingUnitName == null ) throw new IllegalArgumentException("processingUnitName cannot be null"); if ( processingIntervals == null ) throw new IllegalArgumentException("Target intervals cannot be null"); @@ -212,7 +205,7 @@ public class ProgressMeter { targetSizeInBP = processingIntervals.coveredSize(); // start up the timer - progressMeterDaemon = new ProgressMeterDaemon(this, pollingFrequency); + progressMeterDaemon = new ProgressMeterDaemon(this, secondsBetweenProgressUpdates); } public ProgressMeterDaemon getProgressMeterDaemon() { diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java index eb18cb16c..6eb4d0954 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java @@ -25,6 +25,8 @@ package org.broadinstitute.gatk.utils.progressmeter; +import java.util.concurrent.TimeUnit; + /** * Daemon thread that periodically prints the progress of the progress meter * @@ -33,20 +35,10 @@ package org.broadinstitute.gatk.utils.progressmeter; * Time: 9:16 PM */ public final class ProgressMeterDaemon extends Thread { - public final static long DEFAULT_POLL_FREQUENCY_MILLISECONDS = 10 * 1000; - /** * How frequently should we poll and print progress? */ - private final long pollFrequencyMilliseconds; - - /** - * How long are we waiting between print progress calls are issued? - * @return the time in milliseconds between progress meter calls - */ - private long getPollFrequencyMilliseconds() { - return pollFrequencyMilliseconds; - } + private final long secondsBetweenProgressUpdates; /** * Are we to continue periodically printing status, or should we shut down? 
@@ -60,22 +52,19 @@ public final class ProgressMeterDaemon extends Thread { /** * Create a new ProgressMeterDaemon printing progress for meter - * @param meter the progress meter to print progress of + * @param meter the progress meter to print progress + * @param secondsBetweenProgressUpdates how frequently (in seconds) to print progress */ - public ProgressMeterDaemon(final ProgressMeter meter, final long pollFrequencyMilliseconds) { + public ProgressMeterDaemon(final ProgressMeter meter, final long secondsBetweenProgressUpdates) { if ( meter == null ) throw new IllegalArgumentException("meter cannot be null"); - if ( pollFrequencyMilliseconds <= 0 ) throw new IllegalArgumentException("pollFrequencyMilliseconds must be greater than 0 but got " + pollFrequencyMilliseconds); + if ( secondsBetweenProgressUpdates <= 0 ) throw new IllegalArgumentException("secondsBetweenProgressUpdates must be greater than 0 but got " + secondsBetweenProgressUpdates); this.meter = meter; - this.pollFrequencyMilliseconds = pollFrequencyMilliseconds; + this.secondsBetweenProgressUpdates = secondsBetweenProgressUpdates; setDaemon(true); setName("ProgressMeterDaemon"); } - public ProgressMeterDaemon(final ProgressMeter meter) { - this(meter, DEFAULT_POLL_FREQUENCY_MILLISECONDS); - } - /** * Tells this daemon thread to shutdown at the next opportunity, as the progress * metering is complete. 
@@ -102,7 +91,7 @@ public final class ProgressMeterDaemon extends Thread { meter.printProgress(false); meter.updateElapsedTimeInNanoseconds(); try { - Thread.sleep(getPollFrequencyMilliseconds()); + Thread.sleep(TimeUnit.SECONDS.toMillis(secondsBetweenProgressUpdates)); } catch (InterruptedException e) { throw new RuntimeException(e); } From 641382eb8bbe1af0208021263829fe23ba626c62 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Wed, 1 Jun 2016 12:29:14 -0400 Subject: [PATCH 21/68] Fix BetaTestingAnnotation group Add test --- .../HaplotypeCallerGVCFIntegrationTest.java | 9 +++++++++ .../annotator/interfaces/BetaTestingAnnotation.java | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index bb8373b20..bfd444274 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -340,6 +340,15 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { executeTest(" testASMQMateRankSumAnnotation", spec); } + @Test + public void testBetaTestingAnnotationGroup() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G BetaTesting --disableDithering", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); + final 
WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("df746da577c1d8a340f93b9d5df4df80")); + spec.disableShadowBCF(); + executeTest(" testASMQMateRankSumAnnotation", spec); + } + @Test public void testASInsertSizeRankSum() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum", diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/BetaTestingAnnotation.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/BetaTestingAnnotation.java index 2a7887021..fa7bc5fde 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/BetaTestingAnnotation.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/BetaTestingAnnotation.java @@ -28,5 +28,5 @@ package org.broadinstitute.gatk.tools.walkers.annotator.interfaces; /** * Annotations implementing this interface are not guaranteed to persist between GATK versions */ -public interface BetaTestingAnnotation { +public interface BetaTestingAnnotation extends AnnotationType { } From 4f2e3128050db69150290e5982c78dbaa27aff18 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Thu, 7 Jul 2016 10:24:20 -0400 Subject: [PATCH 22/68] Throw an exception for invalid Picard intervals --- .../gatk/utils/interval/IntervalUtils.java | 13 +++++++------ .../gatk/utils/GenomeLocParserUnitTest.java | 17 +++++++++-------- .../utils/interval/IntervalUtilsUnitTest.java | 15 ++------------- 3 files changed, 18 insertions(+), 27 deletions(-) diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java index b2ff1708e..9f6e352bb 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java @@ -144,16 +144,17 @@ public class IntervalUtils { IntervalList il = IntervalList.fromFile(inputFile); isPicardInterval = true; - int nInvalidIntervals = 0; for (Interval interval : il.getIntervals()) { - if ( glParser.isValidGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true)) - ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true)); + if (interval.getStart() - interval.getEnd() == 1 ) { // remove once a corrected version of the exome interval list is released. + logger.warn("Possible incorrectly converted length 1 interval : " + interval); + } + else if ( glParser.isValidGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true)) { + ret.add(glParser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true)); + } else { - nInvalidIntervals++; + throw new UserException(inputFile.toString() + " has an invalid genome location : " + interval) ; } } - if ( nInvalidIntervals > 0 ) - logger.warn("Ignoring " + nInvalidIntervals + " invalid intervals from " + inputFile); } // if that didn't work, try parsing file as a GATK interval file diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java index 5432e236f..3f58e2d77 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java @@ -51,6 +51,7 @@ import java.util.*; import static org.testng.Assert.assertEquals; 
import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertFalse; /** * @author aaron @@ -316,14 +317,14 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testValidationOfGenomeLocs() { assertTrue(genomeLocParser.isValidGenomeLoc("chr1",1,1)); - assertTrue(!genomeLocParser.isValidGenomeLoc("chr2",1,1)); // shouldn't have an entry - assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,11)); // past the end of the contig - assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",-1,10)); // bad start - assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop - assertTrue( genomeLocParser.isValidGenomeLoc("chr1",-1,2, false)); // bad stop - assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end - assertTrue( genomeLocParser.isValidGenomeLoc("chr1",10,11, false)); // bad start, past end - assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",2,1)); // stop < start + assertFalse(genomeLocParser.isValidGenomeLoc("chr2",1,1)); // shouldn't have an entry + assertFalse(genomeLocParser.isValidGenomeLoc("chr1",1,11)); // past the end of the contig + assertFalse(genomeLocParser.isValidGenomeLoc("chr1",-1,10)); // bad start + assertFalse(genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop + assertTrue(genomeLocParser.isValidGenomeLoc("chr1",-1,2, false)); // bad stop + assertFalse(genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end + assertTrue(genomeLocParser.isValidGenomeLoc("chr1",10,11, false)); // bad start, past end + assertFalse(genomeLocParser.isValidGenomeLoc("chr1",2,1)); // stop < start } @Test(expectedExceptions = ReviewedGATKException.class) diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java index 07b49d480..b0db3800c 100644 --- 
a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java @@ -1046,23 +1046,12 @@ public class IntervalUtilsUnitTest extends BaseTest { }; } - /* - * This test is disabled because its assumption that we will not throw an error - * upon parsing invalid Picard intervals is no longer true, as htsjdk has added - * extra protection against invalid intervals to IntervalList.add(). - * - * We should reconsider our decision in IntervalUtils.intervalFileToList() to - * silently ignore invalid intervals when parsing Picard interval files, as it's - * inconsistent with the way we handle invalid intervals for GATK interval files - * (throw a UserException, covered by testInvalidGATKFileIntervalHandling() below), - * and update this test accordingly. - */ - @Test(dataProvider="invalidIntervalTestData", enabled = false) + @Test(dataProvider="invalidIntervalTestData", expectedExceptions=UserException.class, enabled = true) public void testInvalidPicardIntervalHandling(GenomeLocParser genomeLocParser, String contig, int intervalStart, int intervalEnd ) throws Exception { SAMFileHeader picardFileHeader = new SAMFileHeader(); - picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1")); + picardFileHeader.addSequence(genomeLocParser.getContigInfo(contig)); IntervalList picardIntervals = new IntervalList(picardFileHeader); picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname")); From 7392c4d1b068113f87139ad05ffe5cc4ddab7045 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Thu, 23 Jun 2016 17:14:06 -0400 Subject: [PATCH 23/68] Removed spanning deletions if the deletion was removed --- .../walkers/genotyper/GenotypingEngine.java | 84 +++++++++++++++---- .../GenotypeGVCFsIntegrationTest.java | 9 ++ 2 files changed, 76 insertions(+), 17 deletions(-) diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index 83c7ed533..cdcb2c195 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -104,8 +104,7 @@ public abstract class GenotypingEngine upstreamDeletionsLoc = new LinkedList<>(); /** * Construct a new genotyper engine, on a specific subset of samples. @@ -233,7 +232,7 @@ public abstract class GenotypingEngine alleles = afcr.getAllelesUsedInGenotyping(); final int alternativeAlleleCount = alleles.size() - 1; @@ -355,23 +354,74 @@ public abstract class GenotypingEngine 0) { + final GenomeLoc genomeLoc = genomeLocParser.createGenomeLocOnContig(vc.getContig(), vc.getStart(), vc.getStart() + deletionSize); + upstreamDeletionsLoc.add(genomeLoc); + } + } + + /** + * Is the variant context covered by an upstream deletion? + * + * @param vc variant context + * @return true if the location is covered by an upstream deletion, false otherwise + */ + private boolean coveredByDeletion(final VariantContext vc) { + for (Iterator it = upstreamDeletionsLoc.iterator(); it.hasNext(); ) { + final GenomeLoc loc = it.next(); + if (!loc.getContig().equals(vc.getContig())) { // past contig deletion. + it.remove(); + } else if (loc.getStop() < vc.getStart()) { // past position in current contig deletion. + it.remove(); + } else if (loc.getStart() == vc.getStart()) { + // ignore this deletion, the symbolic one does not make reference to it. + } else { // deletion covers. + return true; + } + } + + return false; + } + /** * Checks whether even if the allele is not well supported by the data, we should keep it for genotyping. 
* diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 56d506ce3..04153ed2d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -661,4 +661,13 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { spec.disableShadowBCF(); executeTest("testGenotypingSpanningDeletionWithAllSites", spec); } + + @Test + public void testGenotypingSpanningDeletionAcrossLines() { + final WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V " + privateTestDir + "input-1_2256566.vcf", b37KGReference), + Collections.singletonList("1f914189326cdd17d0a8753f13cb221f")); + spec.disableShadowBCF(); + executeTest("testGenotypingSpanningDeletionAcrossLines", spec); + } } \ No newline at end of file From 3daed9e5a19097ac57cfe46b25ea4601b09eb1a3 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Tue, 19 Jul 2016 14:44:49 -0400 Subject: [PATCH 24/68] Added exception for GQB values greater than MAX_GENOTYPE_QUAL and tests. 
--- .../haplotypecaller/HaplotypeCaller.java | 8 +-- .../gatk/utils/gvcf/GVCFWriter.java | 34 ++++++----- .../HaplotypeCallerGVCFIntegrationTest.java | 58 +++++++++++++------ .../GenotypeGVCFsIntegrationTest.java | 2 +- .../gatk/utils/gvcf/GVCFWriterUnitTest.java | 30 ++++++---- 5 files changed, 83 insertions(+), 49 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index c7cc4cecc..b284b56a6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -367,10 +367,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * * This argument allows you to set the GQ boundaries. HC expects a list of multiple GQ threshold values. To pass * multiple values, you provide them one by one with the argument, as in `-GQB 10 -GQB 20 -GQB 30` and so on. Note - * that GQ values are capped at 99 in the GATK. + * that GQ values are capped at 99 in the GATK, so values must be integers in [1, 99]. 
*/ @Advanced - @Argument(fullName="GVCFGQBands", shortName="GQB", doc="GQ thresholds for reference confidence bands", required = false) + @Argument(fullName="GVCFGQBands", shortName="GQB", doc="GQ thresholds for reference confidence bands (must be in [1, 99] and specified in increasing order)", required = false) protected List GVCFGQBands = new ArrayList(70) {{ for (int i=1; i<=60; ++i) add(i); add(70); add(80); add(90); add(99); @@ -754,8 +754,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In try { vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands, HCAC.genotypeArgs.samplePloidy); - } catch ( IllegalArgumentException e ) { - throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); + } catch ( final IllegalArgumentException e ) { + throw new UserException.BadArgumentValue("GVCFGQBands", e.getMessage()); } } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java index f0e4ca9eb..23d6dbb2d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java @@ -74,13 +74,15 @@ import java.util.List; */ public class GVCFWriter implements VariantContextWriter { + private static final int MAX_GENOTYPE_QUAL = VCFConstants.MAX_GENOTYPE_QUAL; + // // Final fields initialized in constructor // /** Where we'll ultimately write our VCF records */ - final private VariantContextWriter underlyingWriter; + private final VariantContextWriter underlyingWriter; - final private List GQPartitions; + private final List GQPartitions; /** fields updated on the fly during GVCFWriter operation */ int nextAvailableStart = -1; @@ -90,26 +92,28 @@ public class GVCFWriter implements VariantContextWriter { private final int defaultPloidy; /** - * Is the 
proposed GQ partitions well-formed? + * Are the proposed GQ partitions well-formed? * * @param GQPartitions proposed GQ partitions * @return a non-null string if something is wrong (string explains issue) */ protected static List parsePartitions(final List GQPartitions, final int defaultPloidy) { - if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null"); - if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty"); + if ( GQPartitions == null ) throw new IllegalArgumentException("The list of GQ partitions cannot be null."); + if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("The list of GQ partitions cannot be empty."); final List result = new LinkedList<>(); int lastThreshold = 0; for ( final Integer value : GQPartitions ) { - if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer"); - if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value); - if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value); - result.add(new HomRefBlock(lastThreshold, value,defaultPloidy)); + if ( value == null || value <= 0 ) throw new IllegalArgumentException("The list of GQ partitions contains a null or non-positive integer."); + if ( value < lastThreshold ) throw new IllegalArgumentException(String.format("The list of GQ partitions is out of order. 
Previous value is %d but the next is %d.", lastThreshold, value)); + if ( value == lastThreshold ) throw new IllegalArgumentException(String.format("The value %d appears more than once in the list of GQ partitions.", value)); + if ( value > MAX_GENOTYPE_QUAL ) throw new IllegalArgumentException(String.format("The value %d in the list of GQ partitions is greater than VCFConstants.MAX_GENOTYPE_QUAL = %d.", value, VCFConstants.MAX_GENOTYPE_QUAL)); + result.add(new HomRefBlock(lastThreshold, value, defaultPloidy)); lastThreshold = value; } - result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE,defaultPloidy)); - + if (lastThreshold <= MAX_GENOTYPE_QUAL ) { + result.add(new HomRefBlock(lastThreshold, MAX_GENOTYPE_QUAL + 1, defaultPloidy)); + } return result; } @@ -209,10 +213,14 @@ public class GVCFWriter implements VariantContextWriter { } private boolean genotypeCanBeMergedInCurrentBlock(final Genotype g) { - return currentBlock != null && currentBlock.withinBounds(g.getGQ()) && currentBlock.getPloidy() == g.getPloidy() + return currentBlock != null && currentBlock.withinBounds(capToMaxGQ(g.getGQ())) && currentBlock.getPloidy() == g.getPloidy() && (currentBlock.getMinPLs() == null || !g.hasPL() || (currentBlock.getMinPLs().length == g.getPL().length)); } + private int capToMaxGQ(final int gq) { + return Math.min(gq, MAX_GENOTYPE_QUAL); + } + /** * Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null */ @@ -268,7 +276,7 @@ public class GVCFWriter implements VariantContextWriter { // figure out the GQ limits to use based on the GQ of g HomRefBlock partition = null; for ( final HomRefBlock maybePartition : GQPartitions ) { - if ( maybePartition.withinBounds(g.getGQ()) ) { + if ( maybePartition.withinBounds(capToMaxGQ(g.getGQ())) ) { partition = maybePartition; break; } diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index bb8373b20..80139438a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; +import htsjdk.variant.vcf.VCFConstants; import org.apache.commons.io.FileUtils; import org.apache.log4j.Level; import org.broadinstitute.gatk.engine.GATKVCFUtils; @@ -67,6 +68,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.stream.Collectors; public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { @@ -87,10 +89,10 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { //tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8d30370465d74fd549d76dd31adc4c0c"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "cf5545094ebb264fa8eb879fd848d9ef"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a6bbc30b82e7864baf64163d55f5aee5"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "0086cc735cf792a9f236ec057c73b750"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"}); tests.add(new Object[]{NA12878_WEx, 
ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2c67bdc08c8784f2114c2039270b9766"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "63fa5841a21e2c13f1e1a8e2d4ea3380"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "861fa31b135d200f765914126b422cf4"}); return tests.toArray(new Object[][]{}); } @@ -106,11 +108,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ae2c7e570855f6d6ca58ddd1089a970"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "8bb824886fb0e77d0e8317d69f9d1b62"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "ca87b62a070801e4954d72169b88fb9c"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "1f19c2b2b528dff502bc1a47701edde7"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"}); - final String NA12878bandedResolutionMD5 = "8d4a51af32cd13ba4b3e33dd00c58398"; + final String NA12878bandedResolutionMD5 = "7240907ec3dc2ed49b55c9956546ba13"; tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5}); tests.add(new Object[]{NA12878_WEx + " -I " + privateTestDir + "NA20313.highCoverageRegion.bam -sn NA12878", ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5}); @@ -129,10 +131,10 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new 
Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "450906ce3c11860c25b90cf0a56bb1a0"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "3c0346d41a7e57b45b85a920cc04f51f"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "49f41972e19f6897659e497d32730dde"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "50e628de2a79cd6887af020b713ca3b8"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "8123d8b68b6fa77ef084f292e191622a"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "e48bbcf453e63a6ea5eeda05f6865f94"}); return tests.toArray(new Object[][]{}); } @@ -147,15 +149,14 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "0bc1ca3bff07381a344685b048e76ee4"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9d1724150feccb0a09b6fad522605bb1"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "3ff7e3cd9f6b1949d19f52fab53bdb5e"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "8a094080fb25bbcd39325dcdd62bcf65"}); - tests.add(new 
Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "f35192d245babba9764128abad669019"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "685025831ac783784d7838e568e35f46"}); return tests.toArray(new Object[][]{}); } - /** * Test HaplotypeCaller, using MyDataProvider */ @@ -276,7 +277,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testWrongGVCFNonVariantRecordOrderBugFix() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("6facd3d2cf9f52877182d627cef1c872")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("f70b7052dfeb065ee8c7d796f1a1f84a")); spec.disableShadowBCF(); executeTest("testMissingGVCFIndexingStrategyException", spec); } @@ -293,7 +294,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testNoCallGVCFMissingPLsBugFix() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9fc3c68f46e747b730615c0be98cb013")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 
Arrays.asList("66f242cf3f1f1776c743505b84505f94")); spec.disableShadowBCF(); executeTest("testNoCallGVCFMissingPLsBugFix", spec); } @@ -326,7 +327,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("d986868d83057c0ecdf7ba177b8282f3")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("5877ccbc99bbaffbcd5fe3aaa3d7e7f7")); spec.disableShadowBCF(); executeTest(" testAlleleSpecificAnnotations", spec); } @@ -335,7 +336,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testASMQMateRankSumAnnotation() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("87723bd4442c7ec25f65a77d6434957a")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("0381fec3b0d21508b28fa62c2a61ccfc")); 
spec.disableShadowBCF(); executeTest(" testASMQMateRankSumAnnotation", spec); } @@ -344,7 +345,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testASInsertSizeRankSum() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("a63d6912b2f2fab7debee9488fbbd0b0")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("4599a591427c188c117f09ac40cc866f")); spec.disableShadowBCF(); executeTest(" testASInsertSizeRankSum", spec); } @@ -353,7 +354,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testHaplotypeCallerMultiAllelicNonRef() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A StrandAlleleCountsBySample", b37KGReference, privateTestDir + "multiallelic-nonref.bam", "2:47641259-47641859", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("182aa78f42235d2b4dabb87cc6c8a433")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("19fc2c5218d907fcdcd36de2afbef19c")); spec.disableShadowBCF(); executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec); } @@ -362,7 +363,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends 
WalkerTest { public void testHaplotypeCallerMaxNumPLValues() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("446604d4398d4c1bad41b9506624ab91")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("b2adc744d9dff2f488149bcc96d6bb6d")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValues", spec); } @@ -379,7 +380,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a01abc7e0b4a486125967d3a1ebcc33f")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbd37b492f77c50d2da744d5e00c6f90")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel", spec); // Make sure the "Maximum allowed number of PLs exceeded" messages are in the log @@ -404,7 +405,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF 
--no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a01abc7e0b4a486125967d3a1ebcc33f")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbd37b492f77c50d2da744d5e00c6f90")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel", spec); // Make sure the "Maximum allowed number of PLs exceeded" messages are in the log @@ -420,8 +421,27 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testHaplotypeCallerGVCFBlocks() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:1-1000000 -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", b37KGReference, privateTestDir + "gvcf_blocks_test.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("802c53621bd2004d9052a8e81d91df3e")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("0cdf4d6d0a45def15fb11ea30c78e470")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerGVCFBlocks", spec); } + + @DataProvider(name = "dataBadGQBValues") + public Object[][] dataBadGQBValues() { + return new Object[][]{ + {Arrays.asList(-1, 10, 20)}, + {Arrays.asList(10, 20, 1)}, + {Arrays.asList(10, 10, 20)}, + {Arrays.asList(10, 20, VCFConstants.MAX_GENOTYPE_QUAL + 1)} + }; + } + @Test(dataProvider = "dataBadGQBValues") + public void testBadGQBValues(final List inputGQBValues) { + final String inputGQBValuesString = 
inputGQBValues.stream().map(gqb -> "-GQB " + gqb).collect(Collectors.joining(" ")); + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:1-1000000 -ERC GVCF %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "gvcf_blocks_test.bam", inputGQBValuesString, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.BadArgumentValue.class); + spec.disableShadowBCF(); + executeTest("testBadGQBValues", spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 56d506ce3..ae77125d5 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -282,7 +282,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference), 1, - Collections.singletonList("34d76dc8dabc6a97e6d8f5365d7531e5")); + Collections.singletonList("5d8fff160ec6eedb8e02c9207e256073")); spec.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC executeTest("testStrandAlleleCountsBySample", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java index eb421ba3f..7067fd89f 100644 --- 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java @@ -69,6 +69,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.stream.Collectors; public class GVCFWriterUnitTest extends BaseTest { private static class MockWriter implements VariantContextWriter { @@ -378,25 +379,30 @@ public class GVCFWriterUnitTest extends BaseTest { public Object[][] makeBandPartitionData() { List tests = new ArrayList<>(); - tests.add(new Object[]{null, false}); - tests.add(new Object[]{Collections.emptyList(), false}); - tests.add(new Object[]{Arrays.asList(1), true}); - tests.add(new Object[]{Arrays.asList(1, 10), true}); - tests.add(new Object[]{Arrays.asList(1, 10, 30), true}); - tests.add(new Object[]{Arrays.asList(10, 1, 30), false}); - tests.add(new Object[]{Arrays.asList(-1, 1), false}); - tests.add(new Object[]{Arrays.asList(1, null, 10), false}); + tests.add(new Object[]{null, false, null}); + tests.add(new Object[]{Collections.emptyList(), false, null}); + tests.add(new Object[]{Collections.singletonList(1), true, Arrays.asList(0, 1)}); + tests.add(new Object[]{Arrays.asList(1, 10), true, Arrays.asList(0, 1, 10)}); + tests.add(new Object[]{Arrays.asList(1, 10, 30), true, Arrays.asList(0, 1, 10, 30)}); + tests.add(new Object[]{Arrays.asList(10, 1, 30), false, null}); + tests.add(new Object[]{Arrays.asList(-1, 1), false, null}); + tests.add(new Object[]{Arrays.asList(1, null, 10), false, null}); + tests.add(new Object[]{Arrays.asList(1, 1, 10), false, null}); + tests.add(new Object[]{Arrays.asList(1, 10, VCFConstants.MAX_GENOTYPE_QUAL - 1), true, Arrays.asList(0, 1, 10, VCFConstants.MAX_GENOTYPE_QUAL - 1)}); + tests.add(new Object[]{Arrays.asList(1, 10, VCFConstants.MAX_GENOTYPE_QUAL), true, Arrays.asList(0, 1, 10, 
VCFConstants.MAX_GENOTYPE_QUAL)}); + tests.add(new Object[]{Arrays.asList(1, 10, VCFConstants.MAX_GENOTYPE_QUAL + 1), false, null}); return tests.toArray(new Object[][]{}); } @Test(dataProvider = "BandPartitionData") - public void testMyData(final List partitions, final boolean expectedGood) { + public void testBandPartitionData(final List partitions, final boolean expectedGood, final List expectedPartitionLowerBounds) { try { - GVCFWriter.parsePartitions(partitions,2); + final List resultPartitionLowerBounds = GVCFWriter.parsePartitions(partitions, 2).stream().map(HomRefBlock::getGQLowerBound).collect(Collectors.toList()); Assert.assertTrue(expectedGood, "Expected to fail but didn't"); - } catch ( Exception e ) { - Assert.assertTrue(! expectedGood, "Expected to succeed but failed with message " + e.getMessage()); + Assert.assertEquals(resultPartitionLowerBounds, expectedPartitionLowerBounds); + } catch ( final Exception e ) { + Assert.assertTrue(!expectedGood, "Expected to succeed but failed with message " + e.getMessage()); } } } From fef63ce6a810727794b6c71460840205384106d7 Mon Sep 17 00:00:00 2001 From: Valentin Ruano Rubio Date: Fri, 15 Jul 2016 14:20:52 -0400 Subject: [PATCH 25/68] Make sure that multi-alleleic uninformative PLs (0,0,...,0) stay uninformative after biallelization. Addresses issue #1439 (thus #1437). Fixes a bug where non informative PLs were not handled appropriatelly when calculating multi-allelic site QUAL values. This was resulting in long execution times for very large datasets (~200,000 samples in the case of ExAC2). 
--- .../afcalc/IndependentAllelesDiploidExactAFCalculator.java | 4 +++- .../UnifiedGenotyperIndelCallingIntegrationTest.java | 2 +- ...lotypeCallerComplexAndSymbolicVariantsIntegrationTest.java | 2 +- .../walkers/variantutils/GenotypeGVCFsIntegrationTest.java | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java index 1e720a6f5..68a88a717 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java @@ -194,7 +194,9 @@ import java.util.*; } gb.alleles(newAlleles); } - if (combineAltAlleleLikelihoods(oldGenotype, genotypeCount, newLikelihoods, hetLikelihoods, homAltLikelihoods)) + if (oldGenotype.isNonInformative()) + gb.PL(BIALLELIC_NON_INFORMATIVE_PLS); + else if (combineAltAlleleLikelihoods(oldGenotype, genotypeCount, newLikelihoods, hetLikelihoods, homAltLikelihoods)) gb.PL(newLikelihoods); newGenotypes.add(gb.make()); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 9e524e347..791e4ac6d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -140,7 +140,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("f9d848fe5e6e6762e0dd5b5d925f74f4")); + Arrays.asList("08967b41ccc76b1f3c7093e51a90713a")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 42649af05..383b85801 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "88255eda0e29e4a6e128ddb7177a03ab"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "113ae4c0244c50243313a7d6e77da26b"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 04153ed2d..ef81bcf0a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -157,7 +157,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Collections.singletonList("c2f30f25ba4a84e38c04aa49b95694e8")); + Collections.singletonList("af19ee0d7e739143be4e252c48701c45")); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); } @@ -579,7 +579,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { baseTestString(" -V " + privateTestDir + "set.zero.RGQs.no.call.sample1.g.vcf" + " -V " + privateTestDir + "set.zero.RGQs.no.call.sample2.g.vcf" + " -L chr16:1279274-1279874 -allSites", hg19ReferenceWithChrPrefixInChromosomeNames), - Collections.singletonList("b7106be316e43ca04204b78038f65c9f")); + Collections.singletonList("92c097d8b6074d40f8d1385bc92a0a5d")); spec.disableShadowBCF(); executeTest("testSetZeroRGQsToNoCall", spec); } From 832a383acdd5ece9ed76c193d64431b474af8ba1 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Wed, 27 Jul 2016 13:44:12 -0400 Subject: [PATCH 26/68] Fixed MD5 broken by PR #1440. 
--- .../haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 3f9c701de..ddc6587dd 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -345,7 +345,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testBetaTestingAnnotationGroup() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G BetaTesting --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("df746da577c1d8a340f93b9d5df4df80")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("f248a6c4a7645dc5cc9f5ec9f81d9ad5")); spec.disableShadowBCF(); executeTest(" testASMQMateRankSumAnnotation", spec); } From a465c87ff83c1966565d255e9839de2b13807575 Mon Sep 17 00:00:00 2001 From: Andrii Nikitiuk Date: Tue, 15 Sep 2015 17:15:49 -0400 Subject: [PATCH 27/68] Added support for directly reading SRA runs --- .../tools/walkers/bqsr/BaseRecalibrator.java | 4 +- .../haplotypecaller/HaplotypeCaller.java | 11 +- 
.../VariantOverlapAnnotatorUnitTest.java | 4 +- ...seQualitySumPerAlleleBySampleUnitTest.java | 1 - .../HaplotypeCallerIntegrationTest.java | 3 +- .../LocalAssemblyEngineUnitTest.java | 4 +- .../PairHMMIndelErrorModelUnitTest.java | 5 +- .../gatk/utils/ContigComparatorUnitTest.java | 6 +- .../PerReadAlleleLikelihoodMapUnitTest.java | 4 +- .../gatk/engine/CommandLineExecutable.java | 12 +- .../gatk/engine/GenomeAnalysisEngine.java | 2 +- .../bwa/java/AlignerTestHarness.java | 3 +- .../providers/LocusShardDataProvider.java | 4 +- .../providers/ReadShardDataProvider.java | 3 +- .../datasources/providers/ReferenceView.java | 5 +- .../providers/ShardDataProvider.java | 8 +- .../datasources/reads/GATKBAMIndex.java | 376 +-------------- .../reads/GATKBAMIndexFromDataSource.java | 109 +++++ .../reads/GATKBAMIndexFromFile.java | 435 ++++++++++++++++++ .../datasources/reads/SAMDataSource.java | 40 +- .../reads/utilities/FindLargeShards.java | 13 +- .../reference/ReferenceDataSource.java | 8 +- .../executive/HierarchicalMicroScheduler.java | 4 +- .../executive/LinearMicroScheduler.java | 5 +- .../gatk/engine/executive/MicroScheduler.java | 11 +- .../engine/filters/BAQReadTransformer.java | 4 +- .../engine/walkers/ActiveRegionWalker.java | 3 +- .../gatk/engine/ReadMetricsUnitTest.java | 4 +- .../ReferenceOrderedViewUnitTest.java | 3 +- .../providers/ReferenceViewTemplate.java | 4 +- .../reads/FilePointerUnitTest.java | 4 +- .../GATKBAMIndexFromDataSourceUnitTest.java | 98 ++++ ...java => GATKBAMIndexFromFileUnitTest.java} | 16 +- .../reads/SAMDataSourceUnitTest.java | 4 +- .../rmd/ReferenceOrderedDataPoolUnitTest.java | 3 +- ...ReferenceOrderedQueryDataPoolUnitTest.java | 4 +- .../TAROrderedReadCacheUnitTest.java | 4 +- .../TraverseActiveRegionsUnitTest.java | 4 +- .../traversals/TraverseReadsUnitTest.java | 3 +- public/gatk-root/pom.xml | 1 + .../gatk/tools/walkers/qc/QCRef.java | 4 +- .../walkers/varianteval/VariantEval.java | 3 +- .../filters/VariantFiltrationUnitTest.java 
| 4 +- .../indels/IndelRealignerUnitTest.java | 4 +- .../SRAPrintReadsIntegrationTest.java | 143 ++++++ .../gatk/utils/activeregion/ActiveRegion.java | 12 +- .../broadinstitute/gatk/utils/baq/BAQ.java | 18 +- .../CachingIndexedFastaSequenceFile.java | 14 +- .../utils/locusiterator/LIBSPerformance.java | 4 +- .../gatk/utils/sam/ArtificialBAMBuilder.java | 8 +- .../gatk/utils/text/ListFileUtils.java | 18 +- .../gatk/utils/ExampleToCopyUnitTest.java | 4 +- .../gatk/utils/GenomeLocParserBenchmark.java | 4 +- .../gatk/utils/NGSPlatformUnitTest.java | 5 +- .../activeregion/ActiveRegionUnitTest.java | 4 +- .../gatk/utils/baq/BAQUnitTest.java | 3 +- ...chingIndexedFastaSequenceFileUnitTest.java | 15 +- .../tracks/FeatureManagerUnitTest.java | 5 +- .../tracks/RMDTrackBuilderUnitTest.java | 4 +- .../FeatureToGATKFeatureIteratorUnitTest.java | 4 +- .../gatk/utils/sam/ReadUtilsUnitTest.java | 6 +- 61 files changed, 1007 insertions(+), 518 deletions(-) create mode 100644 public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSource.java create mode 100644 public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFile.java create mode 100644 public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java rename public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/{GATKBAMIndexUnitTest.java => GATKBAMIndexFromFileUnitTest.java} (85%) create mode 100644 public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/SRAPrintReadsIntegrationTest.java diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java index 4205ed2d2..4b621b41d 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.bqsr; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.engine.recalibration.*; import org.broadinstitute.gatk.engine.walkers.*; @@ -194,7 +194,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to mask out known variant sites. Please provide a VCF file containing known sites of genetic variation."; private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector - private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation + private ReferenceSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation private final static byte NO_BAQ_UNCERTAINTY = (byte)'@'; /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index b284b56a6..80a95239c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.java.contract.Ensures; import htsjdk.samtools.SAMFileWriter; +import 
htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFConstants; @@ -487,7 +488,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private HaplotypeCallerGenotypingEngine genotypingEngine = null; // fasta reference reader to supplement the edges of the reference sequence - protected CachingIndexedFastaSequenceFile referenceReader; + protected ReferenceSequenceFile referenceReader; // reference base padding size private static final int REFERENCE_PADDING = 500; @@ -683,12 +684,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In vcfWriter.writeHeader(new VCFHeader(headerInfo, sampleSet)); - try { - // fasta reference reader to supplement the edges of the reference sequence - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); - } + // fasta reference reader to supplement the edges of the reference sequence + referenceReader = CachingIndexedFastaSequenceFile.checkAndCreate(getToolkit().getArguments().referenceFile); // create and setup the assembler assemblyEngine = new ReadThreadingAssembler(RTAC.maxNumHaplotypesInPopulation, RTAC.kmerSizes, RTAC.dontIncreaseKmerSizesForCycles, RTAC.allowNonUniqueKmersInRef, RTAC.numPruningSamples); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotatorUnitTest.java index 010f3bb72..8008eaf86 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotatorUnitTest.java +++ 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotatorUnitTest.java @@ -51,7 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -71,7 +71,7 @@ import java.util.*; public class VariantOverlapAnnotatorUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/BaseQualitySumPerAlleleBySampleUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/BaseQualitySumPerAlleleBySampleUnitTest.java index 1a304891f..26f6891a4 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/BaseQualitySumPerAlleleBySampleUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/BaseQualitySumPerAlleleBySampleUnitTest.java @@ -76,5 +76,4 @@ public class BaseQualitySumPerAlleleBySampleUnitTest { Assert.assertFalse(a.isUsableRead(read)); } - } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 747609ace..e34f734e3 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.variant.variantcontext.VariantContext; @@ -207,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { private void HCTestNearbySmallIntervals(String bam, String args, String md5) { try { - final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java index 28690d993..4a0f9b9f2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java @@ -51,11 +51,11 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.Cigar; import 
htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -80,7 +80,7 @@ import java.util.*; public class LocalAssemblyEngineUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private SAMFileHeader header; @BeforeClass diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModelUnitTest.java index 60f7f0fdf..410b895ac 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModelUnitTest.java @@ -51,9 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.indels; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.UnvalidatingGenomeLoc; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; @@ -76,7 +75,7 @@ public class PairHMMIndelErrorModelUnitTest extends BaseTest { @BeforeClass public void setup() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); header = 
ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/ContigComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/ContigComparatorUnitTest.java index be669cca2..4b2ac1b88 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/ContigComparatorUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/ContigComparatorUnitTest.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.utils; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; @@ -71,7 +71,7 @@ public class ContigComparatorUnitTest extends BaseTest { @BeforeClass public void setup() throws FileNotFoundException { // sequence - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); final GenomeLocParser genomeLocParser = new GenomeLocParser(seq); dictForFails = genomeLocParser.getContigs(); } @@ -81,7 +81,7 @@ public class ContigComparatorUnitTest extends BaseTest { List tests = new ArrayList(); for ( final String ref : Arrays.asList(b37KGReference, hg18Reference) ) { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(ref)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(ref)); final GenomeLocParser genomeLocParser = new GenomeLocParser(seq); final SAMSequenceDictionary dict = genomeLocParser.getContigs(); diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index 95e4806fa..b9656ebdb 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.utils.genotyper; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.utils.BaseUtils; @@ -64,7 +65,6 @@ import java.util.Map; import java.util.List; import org.testng.Assert; import org.testng.annotations.Test; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; @@ -81,7 +81,7 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; // example fasta index file, can be deleted if you don't use the reference - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java index 73dc0749c..9598349bb 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java @@ -37,6 +37,7 @@ import 
org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor; import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.utils.text.ListFileUtils; +import htsjdk.samtools.sra.SRAAccession; import java.util.ArrayList; import java.util.Arrays; @@ -63,10 +64,19 @@ public abstract class CommandLineExecutable extends CommandLineProgram { /** * A list of all the arguments initially used as sources. */ - private final Collection argumentSources = new ArrayList(); + private final Collection argumentSources = new ArrayList<>(); protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); + private final static String SRA_LIBS_DOWNLOAD = "samjdk.sra_libraries_download"; + /** + * Set GATK version to be used as part of user agent for network requests + */ + static { + System.setProperty(SRA_LIBS_DOWNLOAD, "true"); + SRAAccession.setAppVersionString("GATK " + getVersionNumber()); + } + /** * this is the function that the inheriting class can expect to have called * when the command line system has initialized. diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java index 469e5491d..85c535c1f 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java @@ -905,7 +905,7 @@ public class GenomeAnalysisEngine { * @return A data source for the given set of reads. 
*/ private SAMDataSource createReadsDataSource(final GATKArgumentCollection argCollection, final GenomeLocParser genomeLocParser, - final IndexedFastaSequenceFile refReader, final Map sampleRenameMap) { + final ReferenceSequenceFile refReader, final Map sampleRenameMap) { DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); // Synchronize the method back into the collection so that it shows up when diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java index fe3f5052c..db098e0e3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java @@ -27,6 +27,7 @@ package org.broadinstitute.gatk.engine.alignment.bwa.java; import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.alignment.Aligner; import org.broadinstitute.gatk.engine.alignment.Alignment; import org.broadinstitute.gatk.utils.BaseUtils; @@ -126,7 +127,7 @@ public class AlignerTestHarness { mismatches++; - IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); + final ReferenceSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); System.out.printf("read = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(read.getReadString(),read.getCigar(),CigarOperator.DELETION), read.getAlignmentStart(), diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java index 227675eea..20615c4d6 100644 --- 
a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.ReadProperties; import org.broadinstitute.gatk.engine.datasources.reads.Shard; import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; @@ -62,7 +62,7 @@ public class LocusShardDataProvider extends ShardDataProvider { * @param shard The chunk of data over which traversals happen. * @param reference A getter for a section of the reference. */ - public LocusShardDataProvider(Shard shard, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, GenomeLoc locus, LocusIterator locusIterator, IndexedFastaSequenceFile reference, Collection rods) { + public LocusShardDataProvider(final Shard shard, final ReadProperties sourceInfo, final GenomeLocParser genomeLocParser, final GenomeLoc locus, final LocusIterator locusIterator, final ReferenceSequenceFile reference, final Collection rods) { super(shard,genomeLocParser,reference,rods); this.sourceInfo = sourceInfo; this.locus = locus; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java index 15f861c20..0483f5bc4 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; import 
htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.datasources.reads.Shard; import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; @@ -50,7 +51,7 @@ public class ReadShardDataProvider extends ShardDataProvider { * @param shard The chunk of data over which traversals happen. * @param reference A getter for a section of the reference. */ - public ReadShardDataProvider(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator reads, IndexedFastaSequenceFile reference, Collection rods) { + public ReadShardDataProvider(final Shard shard, final GenomeLocParser genomeLocParser, final GATKSAMIterator reads, final ReferenceSequenceFile reference, final Collection rods) { super(shard,genomeLocParser,reference,rods); this.reads = reads; } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java index e5932bd94..4e8eae8c5 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java @@ -29,6 +29,7 @@ import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; @@ -61,13 +62,13 @@ public class ReferenceView implements View { /** * The source of reference data. 
*/ - protected IndexedFastaSequenceFile reference = null; + protected ReferenceSequenceFile reference = null; /** * Create a new ReferenceView. * @param provider */ - public ReferenceView( ShardDataProvider provider ) { + public ReferenceView( final ShardDataProvider provider ) { this.genomeLocParser = provider.getGenomeLocParser(); this.reference = provider.getReference(); } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java index 27c71173a..c1b663f8d 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.datasources.reads.Shard; import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -70,7 +70,7 @@ public abstract class ShardDataProvider { /** * Provider of reference data for this particular shard. */ - private final IndexedFastaSequenceFile reference; + private final ReferenceSequenceFile reference; /** * Sources of reference-ordered data. @@ -106,7 +106,7 @@ public abstract class ShardDataProvider { * Gets a pointer into the given indexed fasta sequence file. * @return The indexed fasta sequence file. */ - IndexedFastaSequenceFile getReference() { + ReferenceSequenceFile getReference() { return reference; } @@ -131,7 +131,7 @@ public abstract class ShardDataProvider { * @param shard The chunk of data over which traversals happen. * @param reference A getter for a section of the reference. 
*/ - public ShardDataProvider(Shard shard,GenomeLocParser genomeLocParser,IndexedFastaSequenceFile reference,Collection rods) { + public ShardDataProvider(final Shard shard, final GenomeLocParser genomeLocParser, final ReferenceSequenceFile reference, final Collection rods) { this.shard = shard; this.genomeLocParser = genomeLocParser; this.reference = reference; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java index 394923b1e..7f954d82e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java @@ -25,20 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import htsjdk.samtools.*; -import htsjdk.samtools.seekablestream.SeekableBufferedStream; -import htsjdk.samtools.seekablestream.SeekableFileStream; -import htsjdk.samtools.seekablestream.SeekableStream; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import htsjdk.samtools.Bin; /** * A basic interface for querying BAM indices. @@ -47,116 +34,24 @@ import java.util.List; * @author mhanna * @version 0.1 */ -public class GATKBAMIndex { +public abstract class GATKBAMIndex { + /** - * BAM index file magic number. + * What is the starting bin for each level? */ - private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); + protected static final int[] LEVEL_STARTS = {0,1,9,73,585,4681}; /** * Reports the total amount of genomic data that any bin can index. 
*/ protected static final int BIN_GENOMIC_SPAN = 512*1024*1024; - /** - * What is the starting bin for each level? - */ - private static final int[] LEVEL_STARTS = {0,1,9,73,585,4681}; - /** * Reports the maximum number of bins that can appear in a BAM file. */ public static final int MAX_BINS = 37450; // =(8^6-1)/7+1 - private final SAMSequenceDictionary sequenceDictionary; - private final File mFile; - - //TODO: figure out a good value for this buffer size - private static final int BUFFERED_STREAM_BUFFER_SIZE = 8192; - - /** - * Number of sequences stored in this index. - */ - private final int sequenceCount; - - /** - * A cache of the starting positions of the sequences. - */ - private final long[] sequenceStartCache; - - private SeekableFileStream fileStream; - private SeekableStream baiStream; - private SeekableBufferedStream bufferedStream; - private long fileLength; - - public GATKBAMIndex(final File file, final SAMSequenceDictionary sequenceDictionary) { - mFile = file; - this.sequenceDictionary = sequenceDictionary; - - // Open the file stream. - openIndexFile(); - - // Verify the magic number. - seek(0); - final byte[] buffer = readBytes(4); - if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) { - throw new ReviewedGATKException("Invalid file header in BAM index " + mFile + - ": " + new String(buffer)); - } - - seek(4); - - sequenceCount = readInteger(); - - // Create a cache of the starting position of each sequence. Initialize it to -1. - sequenceStartCache = new long[sequenceCount]; - for(int i = 1; i < sequenceCount; i++) - sequenceStartCache[i] = -1; - - // Seed the first element in the array with the current position. 
- if(sequenceCount > 0) - sequenceStartCache[0] = position(); - - closeIndexFile(); - } - - public GATKBAMIndexData readReferenceSequence(final int referenceSequence) { - openIndexFile(); - - if (referenceSequence >= sequenceCount) - throw new ReviewedGATKException("Invalid sequence number " + referenceSequence + " in index file " + mFile); - - skipToSequence(referenceSequence); - - int binCount = readInteger(); - List bins = new ArrayList<>(); - for (int binNumber = 0; binNumber < binCount; binNumber++) { - final int indexBin = readInteger(); - final int nChunks = readInteger(); - - List chunks = new ArrayList<>(nChunks); - long[] rawChunkData = readLongs(nChunks*2); - for (int ci = 0; ci < nChunks; ci++) { - final long chunkBegin = rawChunkData[ci*2]; - final long chunkEnd = rawChunkData[ci*2+1]; - chunks.add(new GATKChunk(chunkBegin, chunkEnd)); - } - GATKBin bin = new GATKBin(referenceSequence, indexBin); - bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()])); - while(indexBin >= bins.size()) - bins.add(null); - bins.set(indexBin,bin); - } - - final int nLinearBins = readInteger(); - long[] linearIndexEntries = readLongs(nLinearBins); - - LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries); - - closeIndexFile(); - - return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex); - } + public abstract GATKBAMIndexData readReferenceSequence(final int referenceSequence); /** * Get the number of levels employed by this index. @@ -180,91 +75,35 @@ public class GATKBAMIndex { * @param levelNumber Level number. 0-based. * @return The size (number of possible bins) of the given level. */ - public int getLevelSize(final int levelNumber) { - if(levelNumber == getNumIndexLevels()-1) - return MAX_BINS-LEVEL_STARTS[levelNumber]-1; - else - return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber]; - } + public abstract int getLevelSize(final int levelNumber); /** * Gets the level associated with the given bin number. 
* @param bin The bin for which to determine the level. * @return the level associated with the given bin number. */ - public int getLevelForBin(final Bin bin) { - GATKBin gatkBin = new GATKBin(bin); - if(gatkBin.getBinNumber() >= MAX_BINS) - throw new ReviewedGATKException("Tried to get level for invalid bin in index file " + mFile); - for(int i = getNumIndexLevels()-1; i >= 0; i--) { - if(gatkBin.getBinNumber() >= LEVEL_STARTS[i]) - return i; - } - throw new ReviewedGATKException("Unable to find correct bin for bin " + bin + " in index file " + mFile); - } + public abstract int getLevelForBin(final Bin bin); /** * Gets the first locus that this bin can index into. * @param bin The bin to test. * @return The last position that the given bin can represent. */ - public int getFirstLocusInBin(final Bin bin) { - final int level = getLevelForBin(bin); - final int levelStart = LEVEL_STARTS[level]; - final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; - return (new GATKBin(bin).getBinNumber() - levelStart)*(BIN_GENOMIC_SPAN /levelSize)+1; - } + public abstract int getFirstLocusInBin(final Bin bin); /** * Gets the last locus that this bin can index into. * @param bin The bin to test. * @return The last position that the given bin can represent. */ - public int getLastLocusInBin(final Bin bin) { - final int level = getLevelForBin(bin); - final int levelStart = LEVEL_STARTS[level]; - final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; - return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize); - } + public abstract int getLastLocusInBin(final Bin bin); /** * Use to get close to the unmapped reads at the end of a BAM file. * @return The file offset of the first record in the last linear bin, or -1 * if there are no elements in linear bins (i.e. no mapped reads). 
*/ - public long getStartOfLastLinearBin() { - openIndexFile(); - - seek(4); - - final int sequenceCount = readInteger(); - // Because no reads may align to the last sequence in the sequence dictionary, - // grab the last element of the linear index for each sequence, and return - // the last one from the last sequence that has one. - long lastLinearIndexPointer = -1; - for (int i = 0; i < sequenceCount; i++) { - // System.out.println("# Sequence TID: " + i); - final int nBins = readInteger(); - // System.out.println("# nBins: " + nBins); - for (int j1 = 0; j1 < nBins; j1++) { - // Skip bin # - skipBytes(4); - final int nChunks = readInteger(); - // Skip chunks - skipBytes(16 * nChunks); - } - final int nLinearBins = readInteger(); - if (nLinearBins > 0) { - // Skip to last element of list of linear bins - skipBytes(8 * (nLinearBins - 1)); - lastLinearIndexPointer = readLongs(1)[0]; - } - } - - closeIndexFile(); - - return lastLinearIndexPointer; - } + public abstract long getStartOfLastLinearBin(); /** * Gets the possible number of bins for a given reference sequence. @@ -273,197 +112,4 @@ public class GATKBAMIndex { protected int getMaxAddressibleGenomicLocation() { return BIN_GENOMIC_SPAN; } - - protected void skipToSequence(final int referenceSequence) { - // Find the offset in the file of the last sequence whose position has been determined. Start here - // when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always - // be present, so no extra stopping condition is necessary. - int sequenceIndex = referenceSequence; - while(sequenceStartCache[sequenceIndex] == -1) - sequenceIndex--; - - // Advance to the most recently found position. 
- seek(sequenceStartCache[sequenceIndex]); - - for (int i = sequenceIndex; i < referenceSequence; i++) { - sequenceStartCache[i] = position(); - // System.out.println("# Sequence TID: " + i); - final int nBins = readInteger(); - // System.out.println("# nBins: " + nBins); - for (int j = 0; j < nBins; j++) { - /* final int bin = */ - readInteger(); - final int nChunks = readInteger(); - // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); - skipBytes(16 * nChunks); - } - final int nLinearBins = readInteger(); - // System.out.println("# nLinearBins: " + nLinearBins); - skipBytes(8 * nLinearBins); - - } - - sequenceStartCache[referenceSequence] = position(); - } - - - - private void openIndexFile() { - try { - fileStream = new SeekableFileStream(mFile); - baiStream = SamIndexes.asBaiSeekableStreamOrNull(fileStream, sequenceDictionary); - bufferedStream = new SeekableBufferedStream(baiStream, BUFFERED_STREAM_BUFFER_SIZE); - fileLength=bufferedStream.length(); - } - catch (IOException exc) { - throw new ReviewedGATKException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc); - } - } - - private void closeIndexFile() { - try { - bufferedStream.close(); - baiStream.close(); - fileStream.close(); - fileLength = -1; - } - catch (IOException exc) { - throw new ReviewedGATKException("Unable to close index file " + mFile, exc); - } - } - - private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; - private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; - - private byte[] readBytes(int count) { - ByteBuffer buffer = getBuffer(count); - read(buffer); - buffer.flip(); - byte[] contents = new byte[count]; - buffer.get(contents); - return contents; - } - - private int readInteger() { - ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES); - read(buffer); - buffer.flip(); - return buffer.getInt(); - } - - /** - * Reads an array of longs from the file channel, returning the results as an array. 
- * @param count Number of longs to read. - * @return An array of longs. Size of array should match count. - */ - private long[] readLongs(final int count) { - ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES); - read(buffer); - buffer.flip(); - long[] result = new long[count]; - for(int i = 0; i < count; i++) - result[i] = buffer.getLong(); - return result; - } - - private void read(final ByteBuffer buffer) { - final int bytesRequested = buffer.limit(); - if (bytesRequested == 0) - return; - - try { - - //BufferedInputStream cannot read directly into a byte buffer, so we read into an array - //and put the result into the bytebuffer after the if statement. - - // We have a rigid expectation here to read in exactly the number of bytes we've limited - // our buffer to -- if there isn't enough data in the file, the index - // must be truncated or otherwise corrupt: - if(bytesRequested > fileLength - bufferedStream.position()){ - throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + - "It's likely that this file is truncated or corrupt -- " + - "Please try re-indexing the corresponding BAM file.", - mFile)); - } - - int bytesRead = bufferedStream.read(byteArray, 0, bytesRequested); - - // We have a rigid expectation here to read in exactly the number of bytes we've limited - // our buffer to -- if we encounter EOF (-1), the index - // must be truncated or otherwise corrupt: - if (bytesRead <= 0) { - throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + - "It's likely that this file is truncated or corrupt -- " + - "Please try re-indexing the corresponding BAM file.", - mFile)); - } - - if(bytesRead != bytesRequested) - throw new RuntimeException("Read amount different from requested amount. 
This should not happen."); - - buffer.put(byteArray, 0, bytesRequested); - } - catch(IOException ex) { - throw new ReviewedGATKException("Index: unable to read bytes from index file " + mFile); - } - } - - - /** - * A reusable buffer for use by this index generator. - * TODO: Should this be a SoftReference? - */ - private ByteBuffer buffer = null; - - //BufferedStream don't read into ByteBuffers, so we need this temporary array - private byte[] byteArray=null; - private ByteBuffer getBuffer(final int size) { - if(buffer == null || buffer.capacity() < size) { - // Allocate a new byte buffer. For now, make it indirect to make sure it winds up on the heap for easier debugging. - buffer = ByteBuffer.allocate(size); - byteArray = new byte[size]; - buffer.order(ByteOrder.LITTLE_ENDIAN); - } - buffer.clear(); - buffer.limit(size); - return buffer; - } - - private void skipBytes(final int count) { - try { - - //try to skip forward the requested amount. - long skipped = bufferedStream.skip(count); - - if( skipped != count ) { //if not managed to skip the requested amount - throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); - } - } - catch(IOException ex) { - throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); - } - } - - private void seek(final long position) { - try { - //to seek a new position, move the fileChannel, and reposition the bufferedStream - bufferedStream.seek(position); - } - catch(IOException ex) { - throw new ReviewedGATKException("Index: unable to reposition of file channel of index file " + mFile); - } - } - - /** - * Retrieve the position from the current file channel. - * @return position of the current file channel. 
- */ - private long position() { - try { - return bufferedStream.position(); - } - catch (IOException exc) { - throw new ReviewedGATKException("Unable to read position from index file " + mFile, exc); - } - } } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSource.java new file mode 100644 index 000000000..5f3f92b72 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSource.java @@ -0,0 +1,109 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * This class implements BAM index querying API + * by wrapping a class BrowseableBAMIndex from HTSJDK. + * + * @version 0.1 + */ +public class GATKBAMIndexFromDataSource extends GATKBAMIndex { + private File sourceFile; + private SAMFileHeader sourceHeader; + private BrowseableBAMIndex index; + + public GATKBAMIndexFromDataSource(final File sourceFile, final SAMFileHeader sourceHeader, final BrowseableBAMIndex index) { + this.sourceFile = sourceFile; + this.sourceHeader = sourceHeader; + this.index = index; + } + + @Override + public GATKBAMIndexData readReferenceSequence(final int referenceSequence) { + final List sequences = sourceHeader.getSequenceDictionary().getSequences(); + if (referenceSequence >= sequences.size()) + throw new ReviewedGATKException("Sequence number " + referenceSequence + " cannot be greater or equal to " + sequences.size() + " in index file " + sourceFile); + + + final BinList sourceBins = index.getBinsOverlapping(referenceSequence, 0, sequences.get(referenceSequence).getSequenceLength()); + + final List bins = new ArrayList<>(); + for (Bin sourceBin : sourceBins) { + final int indexBin = sourceBin.getBinNumber(); + while(indexBin >= bins.size()) + bins.add(null); + + final GATKBin bin = new GATKBin(referenceSequence, indexBin); + final List chunks = index.getSpanOverlapping(sourceBin).getChunks(); + final List gatkChunks = new ArrayList<>(chunks.size()); + for (Chunk chunk : chunks) { + gatkChunks.add(new GATKChunk(chunk)); + } + + bin.setChunkList(gatkChunks.toArray(new GATKChunk[gatkChunks.size()])); + + bins.set(indexBin, bin); + } + + // there is no interface to get linear index from HTSJDK + final LinearIndex linearIndex = new LinearIndex(referenceSequence, 0, new long[]{}); + + return 
new GATKBAMIndexData(this,referenceSequence,bins,linearIndex); + } + + @Override + public int getLevelSize(int levelNumber) { + return index.getLevelSize(levelNumber); + } + + @Override + public int getLevelForBin(Bin bin) { + return index.getLevelForBin(bin); + } + + @Override + public int getFirstLocusInBin(Bin bin) { + return index.getFirstLocusInBin(bin); + } + + @Override + public int getLastLocusInBin(Bin bin) { + return index.getLastLocusInBin(bin); + } + + @Override + public long getStartOfLastLinearBin() { + return index.getStartOfLastLinearBin(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFile.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFile.java new file mode 100644 index 000000000..f1498a63a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFile.java @@ -0,0 +1,435 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.*; +import htsjdk.samtools.seekablestream.SeekableBufferedStream; +import htsjdk.samtools.seekablestream.SeekableFileStream; +import htsjdk.samtools.seekablestream.SeekableStream; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * This class implements BAM index querying API + * by reading .bai file directly. + * Very much not thread-safe. + * + * @author mhanna + * @version 0.1 + */ +public class GATKBAMIndexFromFile extends GATKBAMIndex { + /** + * BAM index file magic number. + */ + private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); + + private final SAMSequenceDictionary sequenceDictionary; + private final File mFile; + + //TODO: figure out a good value for this buffer size + private static final int BUFFERED_STREAM_BUFFER_SIZE = 8192; + + /** + * Number of sequences stored in this index. + */ + private final int sequenceCount; + + /** + * A cache of the starting positions of the sequences. + */ + private final long[] sequenceStartCache; + + private SeekableFileStream fileStream; + private SeekableStream baiStream; + private SeekableBufferedStream bufferedStream; + private long fileLength; + + public GATKBAMIndexFromFile(final File file, final SAMSequenceDictionary sequenceDictionary) { + mFile = file; + this.sequenceDictionary = sequenceDictionary; + + // Open the file stream. 
+ openIndexFile(); + + // Verify the magic number. + seek(0); + final byte[] buffer = readBytes(4); + if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) { + throw new ReviewedGATKException("Invalid file header in BAM index " + mFile + + ": " + new String(buffer)); + } + + seek(4); + + sequenceCount = readInteger(); + + // Create a cache of the starting position of each sequence. Initialize it to -1. + sequenceStartCache = new long[sequenceCount]; + for(int i = 1; i < sequenceCount; i++) + sequenceStartCache[i] = -1; + + // Seed the first element in the array with the current position. + if(sequenceCount > 0) + sequenceStartCache[0] = position(); + + closeIndexFile(); + } + + public GATKBAMIndexData readReferenceSequence(final int referenceSequence) { + openIndexFile(); + + if (referenceSequence >= sequenceCount) + throw new ReviewedGATKException("Invalid sequence number " + referenceSequence + " in index file " + mFile); + + skipToSequence(referenceSequence); + + final int binCount = readInteger(); + List bins = new ArrayList<>(); + for (int binNumber = 0; binNumber < binCount; binNumber++) { + final int indexBin = readInteger(); + final int nChunks = readInteger(); + + final List chunks = new ArrayList<>(nChunks); + final long[] rawChunkData = readLongs(nChunks*2); + for (int ci = 0; ci < nChunks; ci++) { + final long chunkBegin = rawChunkData[ci*2]; + final long chunkEnd = rawChunkData[ci*2+1]; + chunks.add(new GATKChunk(chunkBegin, chunkEnd)); + } + final GATKBin bin = new GATKBin(referenceSequence, indexBin); + bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()])); + while(indexBin >= bins.size()) + bins.add(null); + bins.set(indexBin,bin); + } + + final int nLinearBins = readInteger(); + long[] linearIndexEntries = readLongs(nLinearBins); + + final LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries); + + closeIndexFile(); + + return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex); + } + + /** + * Gets the number 
of bins in the given level. + * @param levelNumber Level number. 0-based. + * @return The size (number of possible bins) of the given level. + */ + @Override + public int getLevelSize(final int levelNumber) { + if(levelNumber == getNumIndexLevels()-1) + return MAX_BINS-LEVEL_STARTS[levelNumber]-1; + else + return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber]; + } + + /** + * Gets the level associated with the given bin number. + * @param bin The bin for which to determine the level. + * @return the level associated with the given bin number. + */ + @Override + public int getLevelForBin(final Bin bin) { + final GATKBin gatkBin = new GATKBin(bin); + if(gatkBin.getBinNumber() >= MAX_BINS) + throw new ReviewedGATKException("Tried to get level for invalid bin in index file " + mFile); + for(int i = getNumIndexLevels()-1; i >= 0; i--) { + if(gatkBin.getBinNumber() >= LEVEL_STARTS[i]) + return i; + } + throw new ReviewedGATKException("Unable to find correct bin for bin " + bin + " in index file " + mFile); + } + + /** + * Gets the first locus that this bin can index into. + * @param bin The bin to test. + * @return The first position that the given bin can represent. + */ + @Override + public int getFirstLocusInBin(final Bin bin) { + final int level = getLevelForBin(bin); + final int levelStart = LEVEL_STARTS[level]; + final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; + return (new GATKBin(bin).getBinNumber() - levelStart)*(BIN_GENOMIC_SPAN /levelSize)+1; + } + + /** + * Gets the last locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + @Override + public int getLastLocusInBin(final Bin bin) { + final int level = getLevelForBin(bin); + final int levelStart = LEVEL_STARTS[level]; + final int levelSize = ((level==getNumIndexLevels()-1) ?
MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; + return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize); + } + + /** + * Use to get close to the unmapped reads at the end of a BAM file. + * @return The file offset of the first record in the last linear bin, or -1 + * if there are no elements in linear bins (i.e. no mapped reads). + */ + @Override + public long getStartOfLastLinearBin() { + openIndexFile(); + + seek(4); + + final int sequenceCount = readInteger(); + // Because no reads may align to the last sequence in the sequence dictionary, + // grab the last element of the linear index for each sequence, and return + // the last one from the last sequence that has one. + long lastLinearIndexPointer = -1; + for (int i = 0; i < sequenceCount; i++) { + // System.out.println("# Sequence TID: " + i); + final int nBins = readInteger(); + // System.out.println("# nBins: " + nBins); + for (int j1 = 0; j1 < nBins; j1++) { + // Skip bin # + skipBytes(4); + final int nChunks = readInteger(); + // Skip chunks + skipBytes(16 * nChunks); + } + final int nLinearBins = readInteger(); + if (nLinearBins > 0) { + // Skip to last element of list of linear bins + skipBytes(8 * (nLinearBins - 1)); + lastLinearIndexPointer = readLongs(1)[0]; + } + } + + closeIndexFile(); + + return lastLinearIndexPointer; + } + + protected void skipToSequence(final int referenceSequence) { + // Find the offset in the file of the last sequence whose position has been determined. Start here + // when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always + // be present, so no extra stopping condition is necessary.) + int sequenceIndex = referenceSequence; + while(sequenceStartCache[sequenceIndex] == -1) + sequenceIndex--; + + // Advance to the most recently found position.
+ seek(sequenceStartCache[sequenceIndex]); + + for (int i = sequenceIndex; i < referenceSequence; i++) { + sequenceStartCache[i] = position(); + // System.out.println("# Sequence TID: " + i); + final int nBins = readInteger(); + // System.out.println("# nBins: " + nBins); + for (int j = 0; j < nBins; j++) { + /* final int bin = */ + readInteger(); + final int nChunks = readInteger(); + // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); + skipBytes(16 * nChunks); + } + final int nLinearBins = readInteger(); + // System.out.println("# nLinearBins: " + nLinearBins); + skipBytes(8 * nLinearBins); + + } + + sequenceStartCache[referenceSequence] = position(); + } + + + + private void openIndexFile() { + try { + fileStream = new SeekableFileStream(mFile); + baiStream = SamIndexes.asBaiSeekableStreamOrNull(fileStream, sequenceDictionary); + bufferedStream = new SeekableBufferedStream(baiStream, BUFFERED_STREAM_BUFFER_SIZE); + fileLength=bufferedStream.length(); + } + catch (IOException exc) { + throw new ReviewedGATKException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc); + } + } + + private void closeIndexFile() { + try { + bufferedStream.close(); + baiStream.close(); + fileStream.close(); + fileLength = -1; + } + catch (IOException exc) { + throw new ReviewedGATKException("Unable to close index file " + mFile, exc); + } + } + + private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; + private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; + + private byte[] readBytes(int count) { + final ByteBuffer buffer = getBuffer(count); + read(buffer); + buffer.flip(); + byte[] contents = new byte[count]; + buffer.get(contents); + return contents; + } + + private int readInteger() { + final ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES); + read(buffer); + buffer.flip(); + return buffer.getInt(); + } + + /** + * Reads an array of longs from the file channel, returning the results as an array. 
+ * @param count Number of longs to read. + * @return An array of longs. Size of array should match count. + */ + private long[] readLongs(final int count) { + final ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES); + read(buffer); + buffer.flip(); + long[] result = new long[count]; + for(int i = 0; i < count; i++) + result[i] = buffer.getLong(); + return result; + } + + private void read(final ByteBuffer buffer) { + final int bytesRequested = buffer.limit(); + if (bytesRequested == 0) + return; + + try { + + //BufferedInputStream cannot read directly into a byte buffer, so we read into an array + //and put the result into the bytebuffer after the if statement. + + // We have a rigid expectation here to read in exactly the number of bytes we've limited + // our buffer to -- if there isn't enough data in the file, the index + // must be truncated or otherwise corrupt: + if(bytesRequested > fileLength - bufferedStream.position()){ + throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + + "It's likely that this file is truncated or corrupt -- " + + "Please try re-indexing the corresponding BAM file.", + mFile)); + } + + int bytesRead = bufferedStream.read(byteArray, 0, bytesRequested); + + // We have a rigid expectation here to read in exactly the number of bytes we've limited + // our buffer to -- if we encounter EOF (-1), the index + // must be truncated or otherwise corrupt: + if (bytesRead <= 0) { + throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + + "It's likely that this file is truncated or corrupt -- " + + "Please try re-indexing the corresponding BAM file.", + mFile)); + } + + if(bytesRead != bytesRequested) + throw new RuntimeException("Read amount different from requested amount. 
This should not happen."); + + buffer.put(byteArray, 0, bytesRequested); + } + catch(IOException ex) { + throw new ReviewedGATKException("Index: unable to read bytes from index file " + mFile, ex); + } + } + + + /** + * A reusable buffer for use by this index generator. + * TODO: Should this be a SoftReference? + */ + private ByteBuffer buffer = null; + + //The buffered stream can't read into a ByteBuffer, so we need this temporary array + private byte[] byteArray=null; + private ByteBuffer getBuffer(final int size) { + if(buffer == null || buffer.capacity() < size) { + // Allocate a new byte buffer. For now, make it indirect to make sure it winds up on the heap for easier debugging. + buffer = ByteBuffer.allocate(size); + byteArray = new byte[size]; + buffer.order(ByteOrder.LITTLE_ENDIAN); + } + buffer.clear(); + buffer.limit(size); + return buffer; + } + + private void skipBytes(final int count) { + try { + + //try to skip forward the requested amount. + final long skipped = bufferedStream.skip(count); + + if( skipped != count ) { //if not managed to skip the requested amount + throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); + } + } + catch(IOException ex) { + throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile, ex); + } + } + + private void seek(final long position) { + try { + //seek the buffered stream directly to the requested absolute position + bufferedStream.seek(position); + } + catch(IOException ex) { + throw new ReviewedGATKException("Index: unable to reposition of file channel of index file " + mFile, ex); + } + } + + /** + * Retrieve the position from the current file channel. + * @return position of the current file channel. 
+ */ + private long position() { + try { + return bufferedStream.position(); + } + catch (IOException exc) { + throw new ReviewedGATKException("Unable to read position from index file " + mFile, exc); + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java index dd662f01d..76d309bcb 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java @@ -28,7 +28,10 @@ package org.broadinstitute.gatk.engine.datasources.reads; import htsjdk.samtools.MergingSamRecordIterator; import htsjdk.samtools.SamFileHeaderMerger; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFileFactory; +import htsjdk.samtools.sra.SRAAccession; +import htsjdk.samtools.sra.SRAIndexedSequenceFile; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.RuntimeIOException; @@ -51,7 +54,6 @@ import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecordIterator; import org.broadinstitute.gatk.utils.sam.SAMReaderID; @@ -310,7 +312,7 @@ public class SAMDataSource { // Determine the sort order. for(SAMReaderID readerID: readerIDs) { - if (! 
readerID.getSamFile().canRead() ) + if (!SRAAccession.isValid(readerID.getSamFile().getPath()) && !readerID.getSamFile().canRead() ) throw new UserException.CouldNotReadInputFile(readerID.getSamFile(),"file is not present or user does not have appropriate permissions. " + "Please check that the file is present and readable and try again."); @@ -377,15 +379,28 @@ public class SAMDataSource { if (referenceFile == null) { samSequenceDictionary = mergedHeader.getSequenceDictionary(); } else { - samSequenceDictionary = ReferenceSequenceFileFactory. - getReferenceSequenceFile(referenceFile). - getSequenceDictionary(); + ReferenceSequenceFile ref; + // maybe it is SRA file? + if (SRAAccession.isValid(referenceFile.getPath())) { + ref = new SRAIndexedSequenceFile(new SRAAccession(referenceFile.getPath())); + } else { + ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile); + } + samSequenceDictionary = ref.getSequenceDictionary(); } for(SAMReaderID id: readerIDs) { File indexFile = findIndexFile(id.getSamFile()); - if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile, samSequenceDictionary)); + if(indexFile != null) { + bamIndices.put(id, new GATKBAMIndexFromFile(indexFile, samSequenceDictionary)); + continue; + } + + // if index file is not found, try SamReader indexing interface + SamReader reader = readers.getReader(id); + if (reader.indexing().hasBrowseableIndex()) { + bamIndices.put(id, new GATKBAMIndexFromDataSource(id.getSamFile(), reader.getFileHeader(), reader.indexing().getBrowseableIndex())); + } } resourcePool.releaseReaders(readers); @@ -1122,12 +1137,17 @@ public class SAMDataSource { try { if (threadAllocation.getNumIOThreads() > 0) blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = SamReaderFactory.makeDefault() - .referenceSequence(referenceFile) + SamReaderFactory factory = SamReaderFactory.makeDefault() .validationStringency(validationStringency) - 
.setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false); + + if (SRAAccession.isValid(readerID.getSamFile().getPath())) { + reader = factory.open(SamInputResource.of(new SRAAccession(readerID.getSamFile().getPath()))); + } else { + reader = factory.referenceSequence(referenceFile) .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, true) .open(readerID.getSamFile()); + } } catch ( RuntimeIOException e ) { throw new UserException.CouldNotReadInputFile(readerID.getSamFile(), e); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java index 7dace7bc5..377a826e3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.engine.datasources.reads.utilities; import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; import org.broadinstitute.gatk.utils.commandline.Input; @@ -88,12 +89,12 @@ public class FindLargeShards extends CommandLineProgram { @Override public int execute() throws IOException { // initialize reference - IndexedFastaSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile); - GenomeLocParser genomeLocParser = new GenomeLocParser(refReader); + final ReferenceSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile); + final GenomeLocParser genomeLocParser = new GenomeLocParser(refReader); // initialize reads List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); - SAMDataSource 
dataSource = new SAMDataSource(referenceFile, bamReaders, new ThreadAllocation(), null, genomeLocParser); + final SAMDataSource dataSource = new SAMDataSource(referenceFile, bamReaders, new ThreadAllocation(), null, genomeLocParser); // intervals final GenomeLocSortedSet intervalSortedSet; @@ -124,12 +125,12 @@ public class FindLargeShards extends CommandLineProgram { } // Print out the stddev: (sum(x^2) - (1/N)*sum(x)^2)/N - long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue(); - long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue())); + final long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue(); + final long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue())); logger.info(String.format("Number of shards: %d; mean uncompressed size = %d; stddev uncompressed size = %d%n",numberOfShards,mean,stddev)); // Crank through the shards again, this time reporting on the shards significantly larger than the mean. 
- long threshold = mean + stddev*5; + final long threshold = mean + stddev*5; logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java index 5dc159e27..a6189d45c 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.engine.datasources.reference; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; import org.broadinstitute.gatk.engine.datasources.reads.Shard; @@ -46,7 +46,7 @@ import java.util.List; * Looks for fai and dict files, and tries to create them if they don't exist */ public class ReferenceDataSource { - private IndexedFastaSequenceFile reference; + private ReferenceSequenceFile reference; /** our log, which we want to capture anything from this class */ protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); @@ -55,7 +55,7 @@ public class ReferenceDataSource { * Create reference data source from fasta file * @param fastaFile Fasta file to be used as reference */ - public ReferenceDataSource(File fastaFile) { + public ReferenceDataSource(final File fastaFile) { reference = CachingIndexedFastaSequenceFile.checkAndCreate(fastaFile); } @@ -63,7 +63,7 @@ public class ReferenceDataSource 
{ * Get indexed fasta file * @return IndexedFastaSequenceFile that was created from file */ - public IndexedFastaSequenceFile getReference() { + public ReferenceSequenceFile getReference() { return this.reference; } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java index 856c931b1..a0daa1cf6 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.executive; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.TribbleException; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; @@ -111,7 +111,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, - final IndexedFastaSequenceFile reference, + final ReferenceSequenceFile reference, final Collection rods, final ThreadAllocation threadAllocation) { super(engine, walker, reads, reference, rods, threadAllocation); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java index 22ca49300..f35acf77c 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java @@ -25,7 +25,7 @@ package 
org.broadinstitute.gatk.engine.executive; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; @@ -63,7 +63,7 @@ public class LinearMicroScheduler extends MicroScheduler { protected LinearMicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, - final IndexedFastaSequenceFile reference, + final ReferenceSequenceFile reference, final Collection rods, final ThreadAllocation threadAllocation) { super(engine, walker, reads, reference, rods, threadAllocation); @@ -84,7 +84,6 @@ public class LinearMicroScheduler extends MicroScheduler { Accumulator accumulator = Accumulator.create(engine,walker); boolean done = walker.isDone(); - int counter = 0; final TraversalEngine traversalEngine = borrowTraversalEngine(this); for (Shard shard : shardStrategy ) { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java index 5da39245e..f3f28807c 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java @@ -26,7 +26,7 @@ package org.broadinstitute.gatk.engine.executive; import com.google.java.contract.Ensures; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.ReadMetrics; @@ -104,7 +104,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ protected 
final GenomeAnalysisEngine engine; - protected final IndexedFastaSequenceFile reference; + protected final ReferenceSequenceFile reference; private final SAMDataSource reads; protected final Collection rods; @@ -131,7 +131,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * * @return The best-fit microscheduler. */ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + public static MicroScheduler create(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, final ReferenceSequenceFile reference, + final Collection rods, final ThreadAllocation threadAllocation) { if ( threadAllocation.isRunningInParallelMode() ) { logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", @@ -183,7 +184,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { protected MicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, - final IndexedFastaSequenceFile reference, + final ReferenceSequenceFile reference, final Collection rods, final ThreadAllocation threadAllocation) { this.engine = engine; @@ -397,7 +398,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * Returns the reference maintained by this scheduler. * @return The reference maintained by this scheduler. 
*/ - public IndexedFastaSequenceFile getReference() { return reference; } + public ReferenceSequenceFile getReference() { return reference; } protected void cleanup() { try { diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java index af3432f8b..bbe17ba6e 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.filters; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.WalkerManager; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; @@ -41,7 +41,7 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; */ public class BAQReadTransformer extends ReadTransformer { private BAQ baqHMM; - private IndexedFastaSequenceFile refReader; + private ReferenceSequenceFile refReader; private BAQ.CalculationMode cmode; private BAQ.QualityMode qmode; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java index bb3145d6f..669fa4bb3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java @@ -27,6 +27,7 @@ package org.broadinstitute.gatk.engine.walkers; import com.google.java.contract.Ensures; import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import 
org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; @@ -181,7 +182,7 @@ public abstract class ActiveRegionWalker extends Walker allIntervals = new ArrayList(); for( final GenomeLoc interval : intervals.toList() ) { diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java index a5a77ca73..fc6f2b526 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.engine; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.ValidationExclusion; @@ -82,7 +82,7 @@ public class ReadMetricsUnitTest extends BaseTest { // Test the accuracy of the read metrics private File referenceFile; - private IndexedFastaSequenceFile reference; + private ReferenceSequenceFile reference; private SAMSequenceDictionary dictionary; private SAMFileHeader header; private GATKSAMReadGroupRecord readGroup; diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java index 3ca67276a..856167fea 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -25,6 +25,7 @@ package 
org.broadinstitute.gatk.engine.datasources.providers; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.utils.commandline.Tags; @@ -71,7 +72,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { /** * Sequence file. */ - private static IndexedFastaSequenceFile seq; + private static ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; /** diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java index c113b3392..82d03edb5 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java @@ -26,7 +26,7 @@ package org.broadinstitute.gatk.engine.datasources.providers; import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -57,7 +57,7 @@ public abstract class ReferenceViewTemplate extends BaseTest { /** * The fasta, for comparison. 
*/ - protected IndexedFastaSequenceFile sequenceFile = null; + protected ReferenceSequenceFile sequenceFile = null; protected GenomeLocParser genomeLocParser = null; // diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java index fd59eb2d3..beb50c6eb 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.GATKBAMFileSpan; import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -45,7 +45,7 @@ import java.io.FileNotFoundException; * */ public class FilePointerUnitTest extends BaseTest { - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; private SAMReaderID readerID = new SAMReaderID("samFile",new Tags()); diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java new file mode 100644 index 000000000..3cd4ac6e4 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java @@ -0,0 +1,98 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. 
+* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; + +/** + * Test basic functionality in the GATK's wrapper around htsjdk.samtools.BrowseableBAMIndex. + */ +public class GATKBAMIndexFromDataSourceUnitTest extends BaseTest { + private static File bamFile = new File(validationDataLocation+"MV1994.selected.bam"); + + /** + * Storage for the index itself. 
+ */ + private GATKBAMIndex bamIndex; + + @BeforeClass + public void init() throws IOException { + final SAMFileReader reader = new SAMFileReader(bamFile); + reader.enableIndexCaching(true); // needed to get BrowseableBAMIndex + Assert.assertTrue(reader.hasIndex()); + Assert.assertTrue(reader.indexing().hasBrowseableIndex()); + + bamIndex = new GATKBAMIndexFromDataSource(bamFile, reader.getFileHeader(), reader.indexing().getBrowseableIndex()); + reader.close(); + } + + @Test + public void testNumberAndSizeOfIndexLevels() { + // The correct values for this test are pulled directly from the + // SAM Format Specification v1.3-r882, Section 4.1.1, last paragraph. + Assert.assertEquals(GATKBAMIndex.getNumIndexLevels(),6,"Incorrect number of levels in BAM index"); + + // Level 0 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(0),0); + Assert.assertEquals(bamIndex.getLevelSize(0),1); + + // Level 1 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(1),1); + Assert.assertEquals(bamIndex.getLevelSize(1),8-1+1); + + // Level 2 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(2),9); + Assert.assertEquals(bamIndex.getLevelSize(2),72-9+1); + + // Level 3 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(3),73); + Assert.assertEquals(bamIndex.getLevelSize(3),584-73+1); + + // Level 4 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(4),585); + Assert.assertEquals(bamIndex.getLevelSize(4),4680-585+1); + + // Level 5 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(5),4681); + // Need to wait until AbstractBAMFileIndex.getLevelSize() is fixed, will throw an ArrayIndexOutOfBoundsException if 1 before the end of the GenomicIndexUtil.LEVEL_STARTS array + //Assert.assertEquals(bamIndex.getLevelSize(5),37448-4681+1); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testTooManyReferenceSequencesDataSource() { + final GATKBAMIndexData index = bamIndex.readReferenceSequence(1); + } +} diff --git 
a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java similarity index 85% rename from public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java index f6be513bf..c13a0006d 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java @@ -39,7 +39,7 @@ import java.io.FileNotFoundException; /** * Test basic functionality in the GATK's implementation of the BAM index classes. */ -public class GATKBAMIndexUnitTest extends BaseTest { +public class GATKBAMIndexFromFileUnitTest extends BaseTest { private static File bamFile = new File(validationDataLocation+"MV1994.selected.bam"); /** @@ -60,11 +60,11 @@ public class GATKBAMIndexUnitTest extends BaseTest { @BeforeClass public void init() throws FileNotFoundException { - SAMFileReader reader = new SAMFileReader(bamFile); - this.sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); + final SAMFileReader reader = new SAMFileReader(bamFile); + sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); reader.close(); - - bamIndex = new GATKBAMIndex(bamIndexFile, sequenceDictionary); + + bamIndex = new GATKBAMIndexFromFile(bamIndexFile, sequenceDictionary); } @Test @@ -100,13 +100,13 @@ public class GATKBAMIndexUnitTest extends BaseTest { @Test( expectedExceptions = UserException.MalformedFile.class ) public void testDetectTruncatedBamIndexWordBoundary() { - GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_word_boundary.bai"), sequenceDictionary); + 
final GATKBAMIndex index = new GATKBAMIndexFromFile(new File(privateTestDir + "truncated_at_word_boundary.bai"), sequenceDictionary); index.readReferenceSequence(0); } @Test( expectedExceptions = UserException.MalformedFile.class ) - public void testDetectTruncatedBamIndexNonWordBoundary() { - GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_non_word_boundary.bai"), sequenceDictionary); + public void testDetectTruncatedBamIndeFromFilexNonWordBoundary() { + final GATKBAMIndex index = new GATKBAMIndexFromFile(new File(privateTestDir + "truncated_at_non_word_boundary.bai"), sequenceDictionary); index.readReferenceSequence(0); } diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java index 606c3aa72..96731336a 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.ValidationExclusion; @@ -64,7 +64,7 @@ public class SAMDataSourceUnitTest extends BaseTest { private List readers; private File referenceFile; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; /** diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java 
b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java index 1ab027d06..a63ce6ee1 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.rmd; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; import org.testng.Assert; @@ -68,7 +69,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { private RMDTriplet triplet = null; private RMDTrackBuilder builder = null; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; private GenomeLoc testSite1; diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java index 99cd61cbe..40357b442 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.datasources.rmd; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.commandline.Tags; @@ -46,7 +46,7 @@ public class ReferenceOrderedQueryDataPoolUnitTest extends BaseTest{ // Build up query parameters File file = new 
File(BaseTest.privateTestDir + "NA12878.hg19.example1.vcf"); RMDTriplet triplet = new RMDTriplet("test", "VCF", file.getAbsolutePath(), RMDTriplet.RMDStorageType.FILE, new Tags()); - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); GenomeLocParser parser = new GenomeLocParser(seq); GenomeLoc loc = parser.createGenomeLoc("20", 1, 100000); TestRMDTrackBuilder builder = new TestRMDTrackBuilder(seq.getSequenceDictionary(), parser); diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java index 6277646e6..bf625ad11 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.engine.traversals; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialBAMBuilder; @@ -43,7 +43,7 @@ import java.util.List; public class TAROrderedReadCacheUnitTest extends BaseTest { // example fasta index file, can be deleted if you don't use the reference - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java 
b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java index d0b8c7170..f931ac40b 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java @@ -27,6 +27,7 @@ package org.broadinstitute.gatk.engine.traversals; import com.google.java.contract.PreconditionError; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.ValidationExclusion; import org.broadinstitute.gatk.engine.datasources.reads.*; @@ -39,7 +40,6 @@ import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; import org.broadinstitute.gatk.utils.interval.IntervalUtils; import org.broadinstitute.gatk.utils.sam.*; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; @@ -80,7 +80,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private File referenceFile; - private IndexedFastaSequenceFile reference; + private ReferenceSequenceFile reference; private SAMSequenceDictionary dictionary; private GenomeLocParser genomeLocParser; diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java index 45bc8da46..d7a0e6b2e 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java +++ 
b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java @@ -25,7 +25,6 @@ package org.broadinstitute.gatk.engine.traversals; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.engine.walkers.TestCountReadsWalker; import org.broadinstitute.gatk.utils.BaseTest; @@ -90,7 +89,7 @@ public class TraverseReadsUnitTest extends BaseTest { private File output; private TraverseReadsNano traversalEngine = null; - private IndexedFastaSequenceFile ref = null; + private ReferenceSequenceFile ref = null; private GenomeLocParser genomeLocParser = null; private GenomeAnalysisEngine engine = null; diff --git a/public/gatk-root/pom.xml b/public/gatk-root/pom.xml index d4daad80e..aa12e0180 100644 --- a/public/gatk-root/pom.xml +++ b/public/gatk-root/pom.xml @@ -418,6 +418,7 @@ ${gatk.queuetests.run} ${java.io.tmpdir} + sra diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java index 49983241d..5c855775f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.qc; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; @@ -70,7 +70,7 @@ public class QCRef extends RefWalker { String contigName = ""; int contigStart, contigEnd; - IndexedFastaSequenceFile uncachedRef; + ReferenceSequenceFile uncachedRef; byte[] 
uncachedBases; @Override diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java index 1e97c388a..ef6ebd7b6 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEval.java @@ -27,6 +27,7 @@ package org.broadinstitute.gatk.tools.walkers.varianteval; import com.google.java.contract.Requires; import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.util.IntervalTree; import htsjdk.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; @@ -282,7 +283,7 @@ public class VariantEval extends RodWalker implements TreeRedu private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this); // Ancestral alignments - private IndexedFastaSequenceFile ancestralAlignments = null; + private ReferenceSequenceFile ancestralAlignments = null; // The set of all possible evaluation contexts StratificationManager stratManager; diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java index 1e9a1dc81..3ec696592 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.tools.walkers.filters; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import 
org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -51,7 +51,7 @@ public class VariantFiltrationUnitTest extends BaseTest { @BeforeTest public void before() { // Create GenomeLoc - IndexedFastaSequenceFile fasta = CachingIndexedFastaSequenceFile.checkAndCreate(new File(privateTestDir + "iupacFASTA.fasta")); + ReferenceSequenceFile fasta = CachingIndexedFastaSequenceFile.checkAndCreate(new File(privateTestDir + "iupacFASTA.fasta")); GenomeLocParser genomeLocParser = new GenomeLocParser(fasta); chr1 = fasta.getSequenceDictionary().getSequence(0).getSequenceName(); genomeLoc = genomeLocParser.createGenomeLoc(chr1, 5, 10); diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerUnitTest.java index 801db5269..de62d5525 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerUnitTest.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.indels; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; @@ -44,7 +44,7 @@ public class IndelRealignerUnitTest extends BaseTest { @BeforeClass public void setup() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); header = 
ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); } diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/SRAPrintReadsIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/SRAPrintReadsIntegrationTest.java new file mode 100644 index 000000000..6ebb0074b --- /dev/null +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/SRAPrintReadsIntegrationTest.java @@ -0,0 +1,143 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.tools.walkers.readutils; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.SkipException; +import org.testng.annotations.BeforeGroups; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Collections; + +import htsjdk.samtools.sra.SRAAccession; + +import java.lang.reflect.Method; + +public class SRAPrintReadsIntegrationTest extends WalkerTest { + + private static class PRTest { + final String accession; + final String args; + final String md5; + + private PRTest(String accession, String args, String md5) { + this.accession = accession; + this.args = args; + this.md5 = md5; + } + + @Override + public String toString() { + return String.format("PRTest(accession='%s', args='%s')", accession, args); + } + } + + private static boolean canResolveNetworkAccession = false; + private static String checkAccession = "SRR000123"; + private static final String REMOTE_ACCESSION_PATTERN = "^[SED]RR[0-9]{6,9}$"; + + /** + * Are the SRA native libraries loaded and initialized? Does the test accession have a valid name? + */ + @BeforeGroups(groups = {"sra"}) + public final void checkIfCanResolve() { + // Did SRA successfully load the native libraries and are fully initialized? + if (!SRAAccession.isSupported()) { + return; + } + + // Is this is a valid SRA accession? + canResolveNetworkAccession = SRAAccession.isValid(checkAccession); + } + + /** + * Are the SRA native libraries loaded and initialized? 
+ * + * @throws SkipException if the SRA native libraries are loaded and initialized + */ + @BeforeMethod + public final void assertSRAIsSupported(){ + if(!SRAAccession.isSupported()){ + throw new SkipException("Skipping SRA Test because SRA native code is unavailable."); + } + } + + /** + * Skip network SRA Test because cannot resolve remote SRA accession + * + * @param method Provides information about, and access to, a single method on a class or interface + * @param params Method parameters + * @throws SkipException if cannot resold an SRA accession + */ + @BeforeMethod + public void skipIfCantResolve(Method method, Object[] params) { + String accession = null; + + if (params.length > 0) { + Object firstParam = params[0]; + if (firstParam instanceof PRTest) { + accession = ((PRTest)firstParam).accession; + } + } + + if (accession != null && + accession.matches(REMOTE_ACCESSION_PATTERN) && !canResolveNetworkAccession) { + throw new SkipException("Skipping network SRA Test because cannot resolve remote SRA accession '" + + checkAccession + "'."); + } + } + + @DataProvider(name = "PRTest") + public Object[][] createPrintReadsTestData() { + return new Object[][]{ + // Test with local SRA accessions + {new PRTest(privateTestDir + "ERR1214757.sra", "", "173ed87acc794a704aa000c6ab5d63a8")}, + {new PRTest(privateTestDir + "ERR1214757.sra", " -L NC_000001.10:1-50000", "6bc055f028c49bcbca990857e57a6e4b")}, + {new PRTest(privateTestDir + "ERR1214757.sra", " -L NC_000001.10:500000-1000000", "ab545064b2314971cfae7486ff74d779")}, + // Tests with remote SRA accessions + {new PRTest("ERR1214757", "", "173ed87acc794a704aa000c6ab5d63a8")}, + {new PRTest("ERR1214757", " -L NC_000001.10:1-50000", "6bc055f028c49bcbca990857e57a6e4b")}, + {new PRTest("ERR1214757", " -L NC_000001.10:500000-1000000", "ab545064b2314971cfae7486ff74d779")}, + }; + } + + @Test(groups = "sra", dataProvider = "PRTest") + public void testPrintReads(PRTest params) { + + WalkerTestSpec spec = new 
WalkerTestSpec( + "-T PrintReads" + + " -R " + params.accession + + " -I " + params.accession + + params.args + + " --no_pg_tag" + + " -o %s", + Collections.singletonList(params.md5)); + executeTest("testPrintReads-"+params.args, spec).getFirst(); + } + +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java index 1fc67d482..e83628e7c 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java @@ -27,7 +27,7 @@ package org.broadinstitute.gatk.utils.activeregion; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocSortedSet; @@ -168,7 +168,7 @@ public class ActiveRegion implements HasGenomeLocation { /** * See #getActiveRegionReference but with padding == 0 */ - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { + public byte[] getActiveRegionReference( final ReferenceSequenceFile referenceReader ) { return getActiveRegionReference(referenceReader, 0); } @@ -182,21 +182,21 @@ public class ActiveRegion implements HasGenomeLocation { * @return a non-null array of bytes holding the reference bases in referenceReader */ @Ensures("result != null") - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + public byte[] getActiveRegionReference( final ReferenceSequenceFile referenceReader, final int padding ) { return getReference(referenceReader, padding, extendedLoc); } /** * See 
#getActiveRegionReference but using the span including regions not the extended span */ - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { + public byte[] getFullReference( final ReferenceSequenceFile referenceReader ) { return getFullReference(referenceReader, 0); } /** * See #getActiveRegionReference but using the span including regions not the extended span */ - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + public byte[] getFullReference( final ReferenceSequenceFile referenceReader, final int padding ) { return getReference(referenceReader, padding, spanIncludingReads); } @@ -211,7 +211,7 @@ public class ActiveRegion implements HasGenomeLocation { * @return a non-null array of bytes holding the reference bases in referenceReader */ @Ensures("result != null") - public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + public byte[] getReference( final ReferenceSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java index b56305d86..e6facd081 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java @@ -25,12 +25,12 @@ package org.broadinstitute.gatk.utils.baq; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.CigarElement; import 
htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMUtils; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; @@ -151,7 +151,7 @@ public class BAQ { * @param b band width * @param minBaseQual All bases with Q < minBaseQual are up'd to this value */ - public BAQ(final double d, final double e, final int b, final byte minBaseQual, boolean includeClippedBases) { + public BAQ(final double d, final double e, final int b, final byte minBaseQual, final boolean includeClippedBases) { cd = d; ce = e; cb = b; this.minBaseQual = minBaseQual; this.includeClippedBases = includeClippedBases; @@ -521,7 +521,7 @@ public class BAQ { } } - public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) { + public BAQCalculationResult calcBAQFromHMM(final SAMRecord read, final ReferenceSequenceFile refReader) { // start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar int offset = getBandWidth() / 2; long readStart = includeClippedBases ? 
read.getUnclippedStart() : read.getAlignmentStart(); @@ -540,7 +540,7 @@ public class BAQ { // final SimpleTimer total = new SimpleTimer(); // final SimpleTimer local = new SimpleTimer(); // int n = 0; - public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals, int queryStart, int queryEnd ) { + public BAQCalculationResult calcBAQFromHMM(final byte[] ref, final byte[] query, final byte[] quals, final int queryStart, final int queryEnd ) { // total.restart(); if ( queryStart < 0 ) throw new ReviewedGATKException("BUG: queryStart < 0: " + queryStart); if ( queryEnd < 0 ) throw new ReviewedGATKException("BUG: queryEnd < 0: " + queryEnd); @@ -564,7 +564,7 @@ public class BAQ { * @param read * @return */ - private final Pair calculateQueryRange(SAMRecord read) { + private final Pair calculateQueryRange(final SAMRecord read) { int queryStart = -1, queryStop = -1; int readI = 0; @@ -599,7 +599,7 @@ public class BAQ { } // we need to pad ref by at least the bandwidth / 2 on either side - public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) { + public BAQCalculationResult calcBAQFromHMM(final SAMRecord read, final byte[] ref, final int refOffset) { // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read Pair queryRange = calculateQueryRange(read); if ( queryRange == null ) return null; // read has Ns, or is completely clipped away @@ -642,7 +642,7 @@ public class BAQ { return baqResult; } - public byte capBaseByBAQ( byte oq, byte bq, int state, int expectedPos ) { + public byte capBaseByBAQ( final byte oq, final byte bq, final int state, final int expectedPos ) { byte b; boolean isIndel = stateIsIndel(state); int pos = stateAlignedPosition(state); @@ -664,7 +664,7 @@ public class BAQ { * @param calculationType * @return BQ qualities for use, in case qmode is DONT_MODIFY */ - public byte[] baqRead(SAMRecord read, IndexedFastaSequenceFile refReader, CalculationMode 
calculationType, QualityMode qmode ) { + public byte[] baqRead(final SAMRecord read, final ReferenceSequenceFile refReader, final CalculationMode calculationType, final QualityMode qmode ) { if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName()); byte[] BAQQuals = read.getBaseQualities(); // in general we are overwriting quals, so just get a pointer to them @@ -706,7 +706,7 @@ public class BAQ { * @param read * @return */ - public boolean excludeReadFromBAQ(SAMRecord read) { + public boolean excludeReadFromBAQ(final SAMRecord read) { // keeping mapped reads, regardless of pairing status, or primary alignment status. return read.getReadUnmappedFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getDuplicateReadFlag(); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java index a193eebf4..4a09f8393 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -25,6 +25,9 @@ package org.broadinstitute.gatk.utils.fasta; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.sra.SRAAccession; +import htsjdk.samtools.sra.SRAIndexedSequenceFile; import org.broadinstitute.gatk.utils.exceptions.UserException; import htsjdk.samtools.SAMException; import htsjdk.samtools.reference.FastaSequenceIndex; @@ -192,8 +195,17 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * Possibly may be better as an overloaded constructor. * @param fastaFile Fasta file to be used as reference * @return A new instance of a CachingIndexedFastaSequenceFile. 
+ * @throws IllegalArgumentException if Fasta file is null */ - public static CachingIndexedFastaSequenceFile checkAndCreate(final File fastaFile) { + public static ReferenceSequenceFile checkAndCreate(final File fastaFile) { + if ( fastaFile == null ) { + throw new IllegalArgumentException("Fasta file is null"); + } + // maybe it is SRA file? + if (SRAAccession.isValid(fastaFile.getPath())) { + return new SRAIndexedSequenceFile(new SRAAccession(fastaFile.getPath())); + } + // does the fasta file exist? check that first... if (!fastaFile.exists()) throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist."); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java index 55a4efe64..5bb518a50 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java @@ -25,10 +25,10 @@ package org.broadinstitute.gatk.utils.locusiterator; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; @@ -62,7 +62,7 @@ public class LIBSPerformance extends CommandLineProgram { @Override public int execute() throws IOException { - final IndexedFastaSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile); + final ReferenceSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile); final GenomeLocParser genomeLocParser = new GenomeLocParser(reference); final SAMFileReader 
reader = new SAMFileReader(samFile); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java index b69794ed9..1b63e6132 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.utils.sam; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.NGSPlatform; @@ -52,7 +52,7 @@ import java.util.*; public class ArtificialBAMBuilder { public final static int BAM_SHARD_SIZE = 16384; - private final IndexedFastaSequenceFile reference; + private final ReferenceSequenceFile reference; private final GenomeLocParser parser; final int nReadsPerLocus; @@ -73,7 +73,7 @@ public class ArtificialBAMBuilder { SAMFileHeader header; - public ArtificialBAMBuilder(final IndexedFastaSequenceFile reference, int nReadsPerLocus, int nLoci) { + public ArtificialBAMBuilder(final ReferenceSequenceFile reference, int nReadsPerLocus, int nLoci) { this.nReadsPerLocus = nReadsPerLocus; this.nLoci = nLoci; @@ -94,7 +94,7 @@ public class ArtificialBAMBuilder { createAndSetHeader(1); } - public IndexedFastaSequenceFile getReference() { + public ReferenceSequenceFile getReference() { return reference; } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java index 75a56b110..d62c1681b 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java +++ 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.utils.text; +import htsjdk.samtools.sra.SRAAccession; import org.broadinstitute.gatk.utils.commandline.ParsingEngine; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.utils.commandline.Tags; @@ -59,6 +60,13 @@ public class ListFileUtils { * @return a flattened list of the bam files provided */ public static List unpackBAMFileList(final List samFiles, final ParsingEngine parser) { + if ( samFiles == null ) { + throw new IllegalArgumentException("The SAM files are null"); + } + if ( parser == null ) { + throw new IllegalArgumentException("The parser is null"); + } + List unpackedReads = new ArrayList(); for( String inputFileName: samFiles ) { Tags inputFileNameTags = parser.getTags(inputFileName); @@ -79,10 +87,14 @@ public class ListFileUtils { else if(inputFileName.endsWith("stdin")) { unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags)); } + else if(SRAAccession.isValid(inputFileName)) { + unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags)); + } else { - throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM/CRAM files with the .bam/.cram extension and lists of BAM/CRAM files " + - "with the .list extension, but the file %s has neither extension. Please ensure that your BAM/CRAM file or list " + - "of BAM/CRAM files is in the correct format, update the extension, and try again.",inputFileName)); + throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM/CRAM files with the .bam/.cram extension " + "" + + "or SRA archives and lists of BAM/CRAM/SRA files with the .list extension, but the file %s has " + + "neither extension and is not SRA accession. 
Please ensure that your BAM/CRAM file or list " + + "of BAM/CRAM files is in the correct format, update the extension, and try again.", inputFileName)); } } return unpackedReads; diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java index 186f39f84..637539395 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java @@ -29,10 +29,10 @@ package org.broadinstitute.gatk.utils; // the imports for unit testing. -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; @@ -57,7 +57,7 @@ public class ExampleToCopyUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; // example fasta index file, can be deleted if you don't use the reference - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java index 43af1502c..67badf071 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java @@ -27,7 +27,7 @@ package org.broadinstitute.gatk.utils; import com.google.caliper.Param; import 
com.google.caliper.SimpleBenchmark; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; @@ -36,7 +36,7 @@ import java.io.File; * Caliper microbenchmark of genome loc parser */ public class GenomeLocParserBenchmark extends SimpleBenchmark { - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private final int ITERATIONS = 1000000; @Param({"NEW", "NONE"}) diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java index 1f8b1f49c..a06c86ba6 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java @@ -28,10 +28,9 @@ package org.broadinstitute.gatk.utils; // the imports for unit testing. 
- -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; @@ -53,7 +52,7 @@ public class NGSPlatformUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; // example fasta index file, can be deleted if you don't use the reference - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; @BeforeClass public void setup() throws FileNotFoundException { diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java index 4a0b38215..1728f2170 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java @@ -29,8 +29,8 @@ package org.broadinstitute.gatk.utils.activeregion; // the imports for unit testing. 
-import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -51,7 +51,7 @@ import java.util.*; public class ActiveRegionUnitTest extends BaseTest { private final static boolean DEBUG = false; private GenomeLocParser genomeLocParser; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private String contig; private int contigLength; diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java index 0f065eef6..433637927 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.utils.baq; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.Test; @@ -52,7 +53,7 @@ public class BAQUnitTest extends BaseTest { private final int startChr = 1; private final int numChr = 2; private final int chrSize = 1000; - IndexedFastaSequenceFile fasta = null; + ReferenceSequenceFile fasta = null; @BeforeMethod public void before() { diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index d55c3ad59..a8d6fd0db 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ 
b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -32,6 +32,7 @@ package org.broadinstitute.gatk.utils.fasta; import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Priority; import org.broadinstitute.gatk.utils.BaseTest; @@ -93,7 +94,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException { Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers"); - final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + final ReferenceSequenceFile uncached = new IndexedFastaSequenceFile(fasta); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) { @@ -123,7 +124,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { // Tests grabbing sequences around a middle cached value. @Test(dataProvider = "fastas", enabled = true && ! 
DEBUG) public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { - final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + final ReferenceSequenceFile uncached = new IndexedFastaSequenceFile(fasta); final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -191,7 +192,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { // make sure some bases are lower case and some are upper case @Test(enabled = true) public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException { - final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA)); + final ReferenceSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA)); final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(new File(exampleFASTA), true); final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(new File(exampleFASTA)); @@ -208,9 +209,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file. 
Unexpected test state"); } - private int testCases(final IndexedFastaSequenceFile original, - final IndexedFastaSequenceFile casePreserving, - final IndexedFastaSequenceFile allUpper, + private int testCases(final ReferenceSequenceFile original, + final ReferenceSequenceFile casePreserving, + final ReferenceSequenceFile allUpper, final String contig, final int start, final int stop ) { final String orig = fetchBaseString(original, contig, start, stop); final String keptCase = fetchBaseString(casePreserving, contig, start, stop); @@ -226,7 +227,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } } - private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start, final int stop) { + private String fetchBaseString(final ReferenceSequenceFile reader, final String contig, final int start, final int stop) { if ( start == -1 ) return new String(reader.getSequence(contig).getBases()); else diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java index de14e0f43..18c18a917 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java @@ -25,8 +25,7 @@ package org.broadinstitute.gatk.utils.refdata.tracks; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import htsjdk.tribble.FeatureCodec; import org.broadinstitute.gatk.utils.BaseTest; @@ -66,7 +65,7 @@ public class FeatureManagerUnitTest extends BaseTest { public void setup() { File referenceFile = new File(b36KGReference); try { - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); + final 
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); genomeLocParser = new GenomeLocParser(seq); manager = new FeatureManager(); } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java index 3131f92d6..e904f58ed 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -26,7 +26,7 @@ package org.broadinstitute.gatk.utils.refdata.tracks; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Tribble; import htsjdk.tribble.index.Index; import htsjdk.tribble.util.LittleEndianOutputStream; @@ -54,7 +54,7 @@ import java.nio.channels.FileChannel; */ public class RMDTrackBuilderUnitTest extends BaseTest { private RMDTrackBuilder builder; - private IndexedFastaSequenceFile seq; + private ReferenceSequenceFile seq; private GenomeLocParser genomeLocParser; @BeforeMethod diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java index da12cf506..17a66c617 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.utils.refdata.utils; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; import 
org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -43,7 +43,7 @@ public class FeatureToGATKFeatureIteratorUnitTest extends BaseTest { @SuppressWarnings("unchecked") public void testCloseFilePointers() throws IOException { final String chr = "20"; - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); GenomeLocParser parser = new GenomeLocParser(seq); File file = new File(privateTestDir + "NA12878.hg19.example1.vcf"); VCFCodec codec = new VCFCodec(); diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java index 372a711b0..9a9a540a4 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java @@ -26,8 +26,8 @@ package org.broadinstitute.gatk.utils.sam; import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.reference.ReferenceSequenceFile; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.Utils; @@ -223,7 +223,7 @@ public class ReadUtilsUnitTest extends BaseTest { @Test (enabled = true) public void testReadWithNsRefIndexInDeletion() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); final int readLength = 76; @@ -239,7 +239,7 @@ public class 
ReadUtilsUnitTest extends BaseTest { @Test (enabled = true) public void testReadWithNsRefAfterDeletion() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); final int readLength = 76; From 49507faaa34579ab9ada40132a5cc19539a21c4f Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Tue, 2 Aug 2016 15:48:19 -0400 Subject: [PATCH 28/68] Changed maximum allowed GQB value to 100. --- .../haplotypecaller/HaplotypeCaller.java | 13 ++++++--- .../gatk/utils/gvcf/GVCFWriter.java | 28 ++++++++++++++----- .../HaplotypeCallerGVCFIntegrationTest.java | 2 +- .../gatk/utils/gvcf/GVCFWriterUnitTest.java | 4 ++- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 80a95239c..5c2985720 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -366,12 +366,17 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * sites are compressed into bands of similar genotype quality (GQ) that are emitted as a single VCF record. See * the FAQ documentation for more details about the GVCF format. * - * This argument allows you to set the GQ boundaries. HC expects a list of multiple GQ threshold values. To pass - * multiple values, you provide them one by one with the argument, as in `-GQB 10 -GQB 20 -GQB 30` and so on. 
Note - * that GQ values are capped at 99 in the GATK, so values must be integers in [1, 99]. + * This argument allows you to set the GQ bands. HC expects a list of strictly increasing GQ values + * that will act as exclusive upper bounds for the GQ bands. To pass multiple values, + * you provide them one by one with the argument, as in `-GQB 10 -GQB 20 -GQB 30` and so on + * (this would set the GQ bands to be `[0, 10), [10, 20), [20, 30)` and so on, for example). + * Note that GQ values are capped at 99 in the GATK, so values must be integers in [1, 100]. + * If the last value is strictly less than 100, the last GQ band will start at that value (inclusive) + * and end at 100 (exclusive). */ @Advanced - @Argument(fullName="GVCFGQBands", shortName="GQB", doc="GQ thresholds for reference confidence bands (must be in [1, 99] and specified in increasing order)", required = false) + @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Exclusive upper bounds for reference confidence GQ bands " + + "(must be in [1, 100] and specified in increasing order)", required = false) protected List GVCFGQBands = new ArrayList(70) {{ for (int i=1; i<=60; ++i) add(i); add(70); add(80); add(90); add(99); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java index 23d6dbb2d..9bdee19e0 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java @@ -98,20 +98,34 @@ public class GVCFWriter implements VariantContextWriter { * @return a non-null string if something is wrong (string explains issue) */ protected static List parsePartitions(final List GQPartitions, final int defaultPloidy) { - if ( GQPartitions == null ) throw new IllegalArgumentException("The list of GQ partitions cannot be 
null."); - if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("The list of GQ partitions cannot be empty."); + if ( GQPartitions == null ) { + throw new IllegalArgumentException("The list of GQ partitions cannot be null."); + } + if ( GQPartitions.isEmpty() ) { + throw new IllegalArgumentException("The list of GQ partitions cannot be empty."); + } final List result = new LinkedList<>(); int lastThreshold = 0; for ( final Integer value : GQPartitions ) { - if ( value == null || value <= 0 ) throw new IllegalArgumentException("The list of GQ partitions contains a null or non-positive integer."); - if ( value < lastThreshold ) throw new IllegalArgumentException(String.format("The list of GQ partitions is out of order. Previous value is %d but the next is %d.", lastThreshold, value)); - if ( value == lastThreshold ) throw new IllegalArgumentException(String.format("The value %d appears more than once in the list of GQ partitions.", value)); - if ( value > MAX_GENOTYPE_QUAL ) throw new IllegalArgumentException(String.format("The value %d in the list of GQ partitions is greater than VCFConstants.MAX_GENOTYPE_QUAL = %d.", value, VCFConstants.MAX_GENOTYPE_QUAL)); + if ( value == null || value <= 0 ) { + throw new IllegalArgumentException("The list of GQ partitions contains a null or non-positive integer."); + } + if ( value < lastThreshold ) { + throw new IllegalArgumentException(String.format("The list of GQ partitions is out of order. 
" + + "Previous value is %d but the next is %d.", lastThreshold, value)); + } + if ( value == lastThreshold ) { + throw new IllegalArgumentException(String.format("The value %d appears more than once in the list of GQ partitions.", value)); + } + if ( value > MAX_GENOTYPE_QUAL + 1 ) { + throw new IllegalArgumentException(String.format("The value %d in the list of GQ partitions is " + + "greater than VCFConstants.MAX_GENOTYPE_QUAL + 1 = %d.", value, VCFConstants.MAX_GENOTYPE_QUAL + 1)); + } result.add(new HomRefBlock(lastThreshold, value, defaultPloidy)); lastThreshold = value; } - if (lastThreshold <= MAX_GENOTYPE_QUAL ) { + if ( lastThreshold <= MAX_GENOTYPE_QUAL ) { result.add(new HomRefBlock(lastThreshold, MAX_GENOTYPE_QUAL + 1, defaultPloidy)); } return result; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index ddc6587dd..402184e33 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -441,7 +441,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { {Arrays.asList(-1, 10, 20)}, {Arrays.asList(10, 20, 1)}, {Arrays.asList(10, 10, 20)}, - {Arrays.asList(10, 20, VCFConstants.MAX_GENOTYPE_QUAL + 1)} + {Arrays.asList(10, 20, VCFConstants.MAX_GENOTYPE_QUAL + 2)} }; } @Test(dataProvider = "dataBadGQBValues") diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java index 7067fd89f..34a19adce 100644 --- 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java @@ -390,7 +390,9 @@ public class GVCFWriterUnitTest extends BaseTest { tests.add(new Object[]{Arrays.asList(1, 1, 10), false, null}); tests.add(new Object[]{Arrays.asList(1, 10, VCFConstants.MAX_GENOTYPE_QUAL - 1), true, Arrays.asList(0, 1, 10, VCFConstants.MAX_GENOTYPE_QUAL - 1)}); tests.add(new Object[]{Arrays.asList(1, 10, VCFConstants.MAX_GENOTYPE_QUAL), true, Arrays.asList(0, 1, 10, VCFConstants.MAX_GENOTYPE_QUAL)}); - tests.add(new Object[]{Arrays.asList(1, 10, VCFConstants.MAX_GENOTYPE_QUAL + 1), false, null}); + tests.add(new Object[]{Arrays.asList(1, 10, VCFConstants.MAX_GENOTYPE_QUAL + 1), true, Arrays.asList(0, 1, 10)}); + tests.add(new Object[]{Collections.singletonList(VCFConstants.MAX_GENOTYPE_QUAL + 1), true, Collections.singletonList(0)}); + tests.add(new Object[]{Arrays.asList(1, 10, VCFConstants.MAX_GENOTYPE_QUAL + 2), false, null}); return tests.toArray(new Object[][]{}); } From 3510906c7f860d7cc4344340ec537f31c3566100 Mon Sep 17 00:00:00 2001 From: Peter Fan Date: Wed, 3 Aug 2016 14:03:05 -0400 Subject: [PATCH 29/68] addresses issue #1280 now interval padding works for exclude intervals --- .../variantutils/SelectVariantsIntegrationTest.java | 12 ++++++++++++ .../gatk/utils/interval/IntervalUtils.java | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index 6edd72240..2709eb3b2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -88,6 +88,18 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testDiscordanceNoSampleSpecified--" + testFile, spec); } + @Test + public void testExcludeIntervalsPadding(){ + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -L 1:1715011-1734970 -XL 1:1725305 -ip 200 --variant " + + b37hapmapGenotypes + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("2e31c0be0d639d7110e639a11c03f4ca") + ); + + executeTest("testExcludeIntervalsPadding--", spec); + } + @Test public void testRepeatedLineSelection() { String testfile = privateTestDir + "test.dup.vcf"; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java index 9f6e352bb..df92f886a 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java @@ -658,7 +658,7 @@ public class IntervalUtils { GenomeLocSortedSet excludeSortedSet = null; if (excludeIntervals != null && excludeIntervals.size() > 0) { - excludeSortedSet = loadIntervals(excludeIntervals, IntervalSetRule.UNION, intervalMergingRule, 0, genomeLocParser); + excludeSortedSet = loadIntervals(excludeIntervals, IntervalSetRule.UNION, intervalMergingRule, intervalPadding, genomeLocParser); } return new Pair(includeSortedSet, excludeSortedSet); } From f6c18a5182d430f9d636afe4454727bfd8047685 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Thu, 21 Jul 2016 13:29:36 -0400 Subject: [PATCH 30/68] Throw an exception if the BQSR input covariants file is not found --- .../gatk/engine/GenomeAnalysisEngine.java | 9 +++++++-- .../readutils/PrintReadsIntegrationTest.java | 14 ++++++++++++++ 2 files 
changed, 21 insertions(+), 2 deletions(-) diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java index 85c535c1f..c564d78d6 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java @@ -269,8 +269,13 @@ public class GenomeAnalysisEngine { Utils.resetRandomGenerator(System.currentTimeMillis()); // if the use specified an input BQSR recalibration table then enable on the fly recalibration - if (args.BQSR_RECAL_FILE != null) - setBaseRecalibration(args); + if (args.BQSR_RECAL_FILE != null) { + if (args.BQSR_RECAL_FILE.exists()) { + setBaseRecalibration(args); + } else { + throw new UserException("The BQSR recalibration file, " + args.BQSR_RECAL_FILE.getAbsolutePath() + ", does not exist"); + } + } // setup the runtime limits setupRuntimeLimits(args); diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReadsIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReadsIntegrationTest.java index 8f848aa01..d013d0c42 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReadsIntegrationTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReadsIntegrationTest.java @@ -135,4 +135,18 @@ public class PrintReadsIntegrationTest extends WalkerTest { executeTest("testPrintReadsException-"+params.args, spec); } + @Test + public void testPrintReadsNoBQSRFile() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T PrintReads" + + " -R " + hg18Reference + + " -I " + privateTestDir + "HiSeq.1mb.bam" + + " -BSQR bqsrFile" + + " --no_pg_tag" + + " -o %s", + 1, UserException.class); + 
executeTest("testPrintReadsNoBQSRFile-", spec); + } + } From abc4d5b7b3fdfd6d9d6199dec74dc0c7a478d306 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 26 Jul 2016 12:55:14 -0400 Subject: [PATCH 31/68] Bypass spanning deletions in Rank Sum tests --- .../walkers/annotator/AS_RankSumTest.java | 3 +- .../annotator/AS_ReadPosRankSumTest.java | 2 + .../tools/walkers/annotator/RankSumTest.java | 10 +++-- .../walkers/annotator/ReadPosRankSumTest.java | 5 +++ ...perGeneralPloidySuite1IntegrationTest.java | 2 +- ...perGeneralPloidySuite2IntegrationTest.java | 2 +- ...dGenotyperIndelCallingIntegrationTest.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 2 +- ...GenotyperNormalCallingIntegrationTest.java | 2 +- ...lexAndSymbolicVariantsIntegrationTest.java | 6 +-- .../HaplotypeCallerGVCFIntegrationTest.java | 9 ++++ .../HaplotypeCallerIntegrationTest.java | 14 +++--- .../gatk/utils/sam/AlignmentUtils.java | 45 +++++++++++++++++++ .../utils/sam/AlignmentUtilsUnitTest.java | 22 +++++++++ 14 files changed, 106 insertions(+), 20 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java index 1f15d17b5..f413f55cc 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java @@ -277,7 +277,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn final GATKSAMRecord read = el.getKey(); if ( isUsableRead(read, refLoc) ) { final Double value = getElementForRead(read, refLoc, a); - if ( value == null ) + // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion + if ( value == null || value < 0.0 ) continue; 
if(perAlleleValues.containsKey(a.getMostLikelyAllele())) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java index 2bd2eb9fd..e0a92af1d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java @@ -104,6 +104,8 @@ public class AS_ReadPosRankSumTest extends AS_RankSumTest implements AS_Standard int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0); final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); + // Note: For a spanning deletion, readPos is at the upstream end of the deletion and is greater than numAlignedBases (which does not include deletions). + // Hence, the resulting readPos will have a negative value. 
if (readPos > numAlignedBases / 2) readPos = numAlignedBases - (readPos + 1); return (double)readPos; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java index cca58bd66..b1f7bb63e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java @@ -76,6 +76,7 @@ import java.util.*; //TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { static final boolean DEBUG = false; + protected static double INVALID_READ_POSITION = -1; // No mapping to a read position public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -86,11 +87,11 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null final GenotypesContext genotypes = vc.getGenotypes(); - if (genotypes == null || genotypes.size() == 0) + if (genotypes == null || genotypes.isEmpty()) return null; - final ArrayList refQuals = new ArrayList<>(); - final ArrayList altQuals = new ArrayList<>(); + final List refQuals = new ArrayList<>(); + final List altQuals = new ArrayList<>(); for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { @@ -183,7 +184,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR final GATKSAMRecord read = el.getKey(); if ( isUsableRead(read, refLoc) ) { final Double value = getElementForRead(read, refLoc, a); - if ( 
value == null ) + // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion + if ( value == null || value == INVALID_READ_POSITION ) continue; if ( a.getMostLikelyAllele().isReference() ) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java index 51890fc33..fbba479e4 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java @@ -104,6 +104,11 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) return null; + // If the offset inside a deletion, it does not lie on a read. + if ( AlignmentUtils.isInsideDeletion(read.getCigar(), offset) ) { + return INVALID_READ_POSITION; + } + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 ); final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); if (readPos > numAlignedBases / 2) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 92fb16387..456f5e8ca 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -88,6 +88,6 @@ public 
class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe //TODO the old MD5 is kept for the record. //TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach. // executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2"); - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b99416c04ba951577f43fd2d25f46388"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "7421a776c75d0ab5a2ff89d9e7f105ff"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 4262ab665..ff0f23666 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -63,7 +63,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","1d27eaa3557dc28c95b9024114d50ad1"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","f4092488c9785d800c3f6470af7119ce"); } @Test(enabled = true) diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 791e4ac6d..2b94dfc7a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -140,7 +140,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("08967b41ccc76b1f3c7093e51a90713a")); + Arrays.asList("75b5a925c2009a8c14ea34fff3d04443")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7d45eddb5..3cdfec3ba 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -310,7 +310,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + "-A 
SnpEff", 1, - Arrays.asList("e99f100fe71bb7f328b485204c16f14a")); + Arrays.asList("65641c92469ab80513b04144d0eae900")); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 7348e12b1..8278148ec 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -70,7 +70,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("c759b04ed0d948bda95008e29f3f5c2d")); + Arrays.asList("e7f216d2f9857a579ef3e211076b37a4")); executeTest("test MultiSample Pilot1", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 383b85801..af1987adf 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "113ae4c0244c50243313a7d6e77da26b"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "86528820f8c102c712d9562b83204c05"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "8f8680bd8e1549ad88691c9c8af9977c"); + "828ef27284bd4045148728952b3a7d94"); } @Test @@ -114,7 +114,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleConsensusModeComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", - "353f1895047b15b1fec22b559c9da0c1"); + "060eed2610eed818b2ab55d582eb22ec"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 402184e33..bb7cd0a12 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -453,4 +453,13 @@ public class 
HaplotypeCallerGVCFIntegrationTest extends WalkerTest { spec.disableShadowBCF(); executeTest("testBadGQBValues", spec); } + + @Test + public void testHaplotypeCallerGVCSpanDel() { + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:26357667 -ERC GVCF --no_cmdline_in_header -A AS_ReadPosRankSumTest -A ReadPosRankSumTest -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "NexPond-377866-1:26357600-26357700.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("93bc22340e6a4b01a7b96e5a3a12dfc3")); + spec.disableShadowBCF(); + executeTest("testHaplotypeCallerGVCSpanDel", spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index e34f734e3..31bd7d0a8 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -107,7 +107,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeBAMOutFlags() throws IOException { - HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "08943fb76d1cd5b5b8815e3991754911", "6a81bbefa6c4ed7a6b8d2c3e0e5a4756"); + HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "56086abc3bd5e3f7d111f452b7cc4fa1", "6a81bbefa6c4ed7a6b8d2c3e0e5a4756"); } @Test @@ -153,17 +153,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerGraphBasedSingleSample() throws IOException { - HCTest(NA12878_BAM, 
"-likelihoodEngine GraphBased", "ba0dc5f416d69558cb5dd3e0a0a5a084"); + HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "8ab21bd6fb7ef37480f556fd5fa5375c"); } @Test public void testHaplotypeCallerGraphBasedMultiSampleHaploid() throws IOException { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "129bca18bb9eec23004b2d28aa541de2"); + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "01220e85ff6bc49e35a325a1df2519e5"); } @Test public void testHaplotypeCallerGraphBasedMultiSample() throws IOException { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "2b89c9e102a049e223bc0d91156a08a3"); + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "80c5b0f72a7962e1ba846ec20465001f"); } @Test @@ -398,7 +398,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testLackSensitivityDueToBadHaplotypeSelectionFix() { final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("22f5a3e9366e611509f03c984f8b4960")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5087a8855b3ee9ea1091367674783462")); spec.disableShadowBCF(); executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec); } @@ -484,12 +484,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerTandemRepeatAnnotator() throws IOException{ - HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "34328c475325b7dfaa57ab5920478e0c"); + HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", 
"2cf4cab0035d09aa0aec6f3faa2c9df6"); } @Test public void testHBaseCountsBySample() throws IOException{ - HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "f5ad4e03c0faaa806ee6ae536af8a479"); + HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "c4550a5933cc954bad70980750e0df52"); } @Test diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java index 5aae6af85..c3d6b5cac 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java @@ -538,6 +538,51 @@ public final class AlignmentUtils { return alignmentPos; } + /** + * Is the offset inside a deletion? + * + * @param cigar the read's CIGAR -- cannot be null + * @param offset the offset into the CIGAR + * @return true if the offset is inside a deletion, false otherwise + */ + public static boolean isInsideDeletion(final Cigar cigar, final int offset) { + if ( cigar == null ) throw new IllegalArgumentException("attempting to find the alignment position from a CIGAR that is null"); + if ( offset < 0 ) return false; + + // pos counts read bases + int pos = 0; + int prevPos = 0; + + for (final CigarElement ce : cigar.getCigarElements()) { + + switch (ce.getOperator()) { + case I: + case S: + case D: + case M: + case EQ: + case X: + prevPos = pos; + pos += ce.getLength(); + break; + case H: + case P: + case N: + break; + default: + throw new ReviewedGATKException("Unsupported cigar operator: " + ce.getOperator()); + } + + // Is the offset inside a deletion? + if ( prevPos < offset && pos >= offset && ce.getOperator() == CigarOperator.D ) { + return true; + + } + } + + return false; + } + /** * Generate an array of bases for just those that are aligned to the reference (i.e. 
no clips or insertions) * diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java index 24351842f..160d2e51f 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java @@ -515,6 +515,28 @@ public class AlignmentUtilsUnitTest { Assert.assertEquals(actual, expectedResult, "Wrong alignment offset detected for cigar " + cigar.toString()); } + @Test + public void testIsInsideDeletion() { + final List cigarElements = Arrays.asList(new CigarElement(5, CigarOperator.S), + new CigarElement(5, CigarOperator.M), + new CigarElement(5, CigarOperator.EQ), + new CigarElement(6, CigarOperator.N), + new CigarElement(5, CigarOperator.X), + new CigarElement(6, CigarOperator.D), + new CigarElement(1, CigarOperator.P), + new CigarElement(1, CigarOperator.H)); + final Cigar cigar = new Cigar(cigarElements); + for ( int i=-1; i <= 20; i++ ) { + Assert.assertFalse(AlignmentUtils.isInsideDeletion(cigar, i)); + } + for ( int i=21; i <= 26; i++ ){ + Assert.assertTrue(AlignmentUtils.isInsideDeletion(cigar, i)); + } + for ( int i=27; i <= 28; i++ ) { + Assert.assertFalse(AlignmentUtils.isInsideDeletion(cigar, i)); + } + } + //////////////////////////////////////////////////// // Test AlignmentUtils.readToAlignmentByteArray() // //////////////////////////////////////////////////// From 01142dfb1c306af14013415a655912fcbe0a2afc Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 4 Aug 2016 01:50:13 -0400 Subject: [PATCH 32/68] Lots of small improvements to Mutect2 code --- .../cancer/m2/M2ArgumentCollection.java | 35 +- .../gatk/tools/walkers/cancer/m2/MuTect2.java | 385 +++++------ .../cancer/m2/SomaticGenotypingEngine.java | 613 ++++++++---------- .../cancer/m2/TumorPowerCalculator.java | 12 +- 4 
files changed, 446 insertions(+), 599 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java index 8ad89a848..5cc07df4e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/M2ArgumentCollection.java @@ -51,12 +51,41 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection; -import org.broadinstitute.gatk.utils.commandline.Advanced; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.Hidden; +import org.broadinstitute.gatk.utils.commandline.*; + +import java.util.Collections; +import java.util.List; public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection { + + /***************************************/ + // Reference Metadata inputs + /***************************************/ + /** + * MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant + * in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal + * sample to prove the variant is not present in germline. 
+ */ + @Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false) + public List> cosmicRod = Collections.emptyList(); + + /** + * A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants. + */ + @Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false) + public List> normalPanelRod = Collections.emptyList(); + + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + @Advanced @Argument(fullName="m2debug", shortName="m2debug", doc="Print out very verbose M2 debug information", required = false) public boolean M2_DEBUG = false; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java index b3c7d950f..4f509f796 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java @@ -201,7 +201,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // the genotyping engine protected HaplotypeCallerGenotypingEngine genotypingEngine = null; - private byte MIN_TAIL_QUALITY; private double log10GlobalReadMismappingRate; @@ -214,9 +213,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i @ArgumentCollection protected LikelihoodEngineArgumentCollection LEAC = new LikelihoodEngineArgumentCollection(); - @Argument(fullName 
= "debug_read_name", required = false, doc="trace this read name through the calling process") - public String DEBUG_READ_NAME = null; + protected String DEBUG_READ_NAME = null; @Hidden @Advanced @@ -224,24 +222,81 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final public int MQthreshold = 20; - /***************************************/ - // Reference Metadata inputs - /***************************************/ - /** - * MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant - * in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal - * sample to prove the variant is not present in germline. - */ - @Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false) - public List> cosmicRod = Collections.emptyList(); - - /** - * A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants. - */ - @Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false) - public List> normalPanelRod = Collections.emptyList(); + public RodBinding getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; } private HaplotypeBAMWriter haplotypeBAMWriter; + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). + */ + @Advanced + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + /** + * Which annotations to add to the output VCF file. 
See the VariantAnnotator -list argument to view available annotations. + */ + @Advanced + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"})); + + /** + * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, + * so annotations will be excluded even if they are explicitly included with the other options. + */ + @Advanced + @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) + protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"})); + + /** + * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. + */ + @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) + protected String[] annotationClassesToUse = { }; + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + /** + * Active region trimmer reference. 
+ */ + @ArgumentCollection + protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer(); + + @Hidden + @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) + protected String keepRG = null; + + @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) + public byte MIN_BASE_QUALTY_SCORE = 10; + + public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE; + + @Hidden + @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectReads = false; + + @Hidden + @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) + protected boolean captureAssemblyFailureBAM = false; + + @Advanced + @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) + protected boolean dontUseSoftClippedBases = false; + + @Hidden + @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. 
Useful for benchmarking and scalability testing", required=false) + protected boolean justDetermineActiveRegions = false; + + // reference base padding size + private static final int REFERENCE_PADDING = 500; + + private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument @Override public void initialize() { @@ -250,15 +305,15 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i samplesList = new IndexedSampleList(new ArrayList<>(ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))); // MUTECT: check that we have at least one tumor bam - for(SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) { + for(final SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) { if (id.getTags().getPositionalTags().size() == 0) { throw new RuntimeException("BAMs must be tagged as either 'tumor' or 'normal'"); } // only supports single-sample BAMs (ie first read group is representative) - String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample(); + final String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample(); - for(String tag : id.getTags().getPositionalTags()) { + for(final String tag : id.getTags().getPositionalTags()) { if (BAM_TAG_TUMOR.equalsIgnoreCase(tag)) { tumorSAMReaderIDs.add(id); if (tumorSampleName == null) { @@ -330,9 +385,9 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final MergeVariantsAcrossHaplotypes variantMerger = new MergeVariantsAcrossHaplotypes(); final GenomeAnalysisEngine toolkit = getToolkit(); - final GenomeLocParser genomeLocParser = toolkit.getGenomeLocParser(); - genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, genomeLocParser, 
FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(), MTAC, logger), !doNotRunPhysicalPhasing, MTAC); + genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC, + tumorSampleName, normalSampleName, DEBUG_READ_NAME); genotypingEngine.setCrossHaplotypeEventMerger(variantMerger); genotypingEngine.setAnnotationEngine(annotationEngine); @@ -353,7 +408,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i trimmer.snpPadding = 50; samplesList = toolkit.getReadSampleList(); - Set sampleSet = SampleListUtils.asSet(samplesList); + final Set sampleSet = SampleListUtils.asSet(samplesList); if( MTAC.CONTAMINATION_FRACTION_FILE != null ) MTAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(MTAC.CONTAMINATION_FRACTION_FILE, MTAC.CONTAMINATION_FRACTION, sampleSet, logger)); @@ -367,7 +422,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - Set headerInfo = new HashSet<>(); + final Set headerInfo = new HashSet<>(); // all annotation fields from VariantAnnotatorEngine headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); @@ -383,7 +438,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i headerInfo.addAll(getM2HeaderLines()); headerInfo.addAll(getSampleHeaderLines()); - List outputSampleNames = getOutputSampleNames(); + final List outputSampleNames = getOutputSampleNames(); vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames)); @@ -391,7 +446,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } private Set getM2HeaderLines(){ - Set headerInfo = new HashSet<>(); + final Set headerInfo = new HashSet<>(); headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NORMAL_LOD_KEY)); 
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TUMOR_LOD_KEY)); headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY)); @@ -430,21 +485,21 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } private Set getSampleHeaderLines(){ - Set sampleLines = new HashSet<>(); + final Set sampleLines = new HashSet<>(); if (printTCGAsampleHeader) { //NOTE: This will only list the first bam file for each tumor/normal sample if there is more than one - Map normalSampleHeaderAttributes = new HashMap<>(); + final Map normalSampleHeaderAttributes = new HashMap<>(); normalSampleHeaderAttributes.put("ID", "NORMAL"); normalSampleHeaderAttributes.put("SampleName", normalSampleName); if (normalSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF) normalSampleHeaderAttributes.put("File", normalSAMReaderIDs.iterator().next().getSamFilePath()); - VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes); - Map tumorSampleHeaderAttributes = new HashMap<>(); + final VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes); + final Map tumorSampleHeaderAttributes = new HashMap<>(); tumorSampleHeaderAttributes.put("ID", "TUMOR"); tumorSampleHeaderAttributes.put("SampleName", tumorSampleName); if (tumorSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF) tumorSampleHeaderAttributes.put("File", tumorSAMReaderIDs.iterator().next().getSamFilePath()); - VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes); + final VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes); sampleLines.add(normalSampleHeader); sampleLines.add(tumorSampleHeader); @@ -455,7 +510,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i private List getOutputSampleNames(){ if 
(printTCGAsampleHeader) { //Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize() - List sampleNamePlaceholders = new ArrayList<>(2); + final List sampleNamePlaceholders = new ArrayList<>(2); sampleNamePlaceholders.add("TUMOR"); sampleNamePlaceholders.add("NORMAL"); return sampleNamePlaceholders; @@ -466,14 +521,14 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } @Override - public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { if( context == null || context.getBasePileup().isEmpty() ) // if we don't have any data, just abort early return new ActivityProfileState(ref.getLocus(), 0.0); final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context); - AlignmentContext tumorContext = splitContexts.get(tumorSampleName); - AlignmentContext normalContext = splitContexts.get(normalSampleName); + final AlignmentContext tumorContext = splitContexts.get(tumorSampleName); + final AlignmentContext normalContext = splitContexts.get(normalSampleName); // if there are no tumor reads... there is no activity! 
if (tumorContext == null) { @@ -481,7 +536,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } // KCIBUL -- this method was inlined and modified from ReferenceConfidenceModel - ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold); + final ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold); final double[] tumorGLs = calcGenotypeLikelihoodsOfRefVsAny(tumorPileup, ref.getBase(), MIN_BASE_QUALTY_SCORE); final double tumorLod = tumorGLs[1] - tumorGLs[0]; @@ -495,7 +550,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // in any case, we have to handle the case where there is no normal (and thus no normal context) which is // different than having a normal but having no reads (where we should not enter the active region) if (normalSampleName != null && normalContext != null) { - int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE); + final int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE); final double[] normalGLs = calcGenotypeLikelihoodsOfRefVsAny(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, 0.5f); final double normalLod = normalGLs[0] - normalGLs[1]; @@ -528,13 +583,12 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // No reads here so nothing to do! if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } - logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region"); // create the assembly using just high quality reads (Q20 or higher). 
We want to use lower // quality reads in the PairHMM (and especially in the normal) later, so we can't use a ReadFilter - ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension()); - for (GATKSAMRecord rec : originalActiveRegion.getReads()) { + final ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension()); + for (final GATKSAMRecord rec : originalActiveRegion.getReads()) { if (rec.getMappingQuality() >= MQthreshold ) { assemblyActiveRegion.add(rec); } @@ -558,18 +612,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // Stop the trimming madness!!! if (!trimmingResult.isVariationPresent()) return referenceModelForNoVariation(originalActiveRegion,false); - logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result"); final AssemblyResultSet assemblyResult = trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult; -// final AssemblyResultSet assemblyResult = untrimmedAssemblyResult; - - // after talking to Ryan -- they grab the reads out of the assembly (and trim then) to pass into the PairHMM - // because at one point they were trying error correcting of the reads based on the haplotypes.. 
but that is not - // working out, so it's safe for us just to take the reads -// final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping"); @@ -577,10 +624,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads()); } -// -// final ActiveRegion regionForGenotyping = trimmingResult.getCallableRegion(); - -// final ActiveRegion regionForGenotyping = originalActiveRegion; // filter out reads from genotyping which fail mapping quality based criteria //TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method @@ -594,7 +637,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } final Map> perSampleFilteredReadList = splitReadsBySample(filteredReads); - logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads"); // abort early if something is out of the acceptable range @@ -612,17 +654,17 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final List haplotypes = assemblyResult.getHaplotypeList(); final Map> reads = splitReadsBySample( regionForGenotyping.getReads() ); - for (List rec : reads.values()) { + for (final List rec : reads.values()) { logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample"); } final HashMap ARreads_origNormalMQ = new HashMap<>(); - for (GATKSAMRecord read : regionForGenotyping.getReads()) { + for (final GATKSAMRecord read : regionForGenotyping.getReads()) { ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality()); } // modify MAPQ scores in normal to be high so that we don't do any base quality score capping - for(GATKSAMRecord rec : regionForGenotyping.getReads()) { + for(final GATKSAMRecord rec : 
regionForGenotyping.getReads()) { if (isReadFromNormal(rec)) { rec.setMappingQuality(60); } @@ -647,8 +689,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } readLikelihoods.changeReads(readRealignments); - - for (GATKSAMRecord rec : readRealignments.keySet()) { + for (final GATKSAMRecord rec : readRealignments.keySet()) { logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods"); } @@ -666,15 +707,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i assemblyResult.getFullReferenceWithPadding(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping.getLocation(), - getToolkit().getGenomeLocParser(), metaDataTracker, - givenAlleles, false , - tumorSampleName, - normalSampleName, - dbsnp.dbsnp, - cosmicRod, - DEBUG_READ_NAME - ); + givenAlleles); if ( MTAC.bamWriter != null ) { final Set calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes()); @@ -695,16 +729,16 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } - List annotatedCalls = new ArrayList<>(); - int eventCount = calledHaplotypes.getCalls().size(); + final List annotatedCalls = new ArrayList<>(); + final int eventCount = calledHaplotypes.getCalls().size(); Integer minEventDistance = null; Integer maxEventDistance = null; Integer lastPosition = null; - for (VariantContext vc : calledHaplotypes.getCalls()) { + for (final VariantContext vc : calledHaplotypes.getCalls()) { if (lastPosition == null) { lastPosition = vc.getStart(); } else { - int dist = Math.abs(vc.getStart() - lastPosition); + final int dist = Math.abs(vc.getStart() - lastPosition); if (maxEventDistance == null || dist > maxEventDistance) { maxEventDistance = dist; } @@ -713,23 +747,23 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } } } - Map eventDistanceAttributes = new HashMap<>(); + final Map eventDistanceAttributes = new 
HashMap<>(); eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount); eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance); eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance); // can we do this with the Annotation classes instead? - for (VariantContext originalVC : calledHaplotypes.getCalls()) { - VariantContextBuilder vcb = new VariantContextBuilder(originalVC); + for (final VariantContext originalVC : calledHaplotypes.getCalls()) { + final VariantContextBuilder vcb = new VariantContextBuilder(originalVC); - Map attributes = new HashMap<>(originalVC.getAttributes()); + final Map attributes = new HashMap<>(originalVC.getAttributes()); attributes.putAll(eventDistanceAttributes); vcb.attributes(attributes); - Set filters = new HashSet<>(originalVC.getFilters()); + final Set filters = new HashSet<>(originalVC.getFilters()); - double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1); + final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1); if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) { filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME); } @@ -739,22 +773,12 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes)); } - if (filters.size() > 0) { - vcb.filters(filters); - } else { - vcb.passFilters(); - } + vcb.filters(filters.isEmpty() ? 
VariantContext.PASSES_FILTERS : filters); if (printTCGAsampleHeader) { - GenotypesContext genotypesWithBamSampleNames = originalVC.getGenotypes(); - List renamedGenotypes = new ArrayList<>(); - GenotypeBuilder GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(tumorSampleName)); - GTbuilder.name("TUMOR"); - renamedGenotypes.add(GTbuilder.make()); - GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(normalSampleName)); - GTbuilder.name("NORMAL"); - renamedGenotypes.add(GTbuilder.make()); - vcb.genotypes(renamedGenotypes); + final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make(); + final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make(); + vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype)); } annotatedCalls.add(vcb.make()); @@ -765,15 +789,15 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return annotatedCalls; } - private Set calculateFilters(RefMetaDataTracker metaDataTracker, VariantContext vc, Map eventDistanceAttributes) { - Set filters = new HashSet<>(); + private Set calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map eventDistanceAttributes) { + final Set filters = new HashSet<>(); - Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); - Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY); + final Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); + final Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY); - Collection panelOfNormalsVC = metaDataTracker.getValues(normalPanelRod, + final Collection panelOfNormalsVC = metaDataTracker.getValues(MTAC.normalPanelRod, getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), 
vc.getStart())); - VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next(); + final VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next(); if (ponVc != null) { filters.add(GATKVCFConstants.PON_FILTER_NAME); @@ -788,13 +812,13 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i double normalF = 0; int normalAltQualityScoreSum = 0; if (hasNormal()) { - Genotype normalGenotype = vc.getGenotype(normalSampleName); + final Genotype normalGenotype = vc.getGenotype(normalSampleName); // NOTE: how do we get the non-ref depth here? normalAltCounts = normalGenotype.getAD()[1]; normalF = (Double) normalGenotype.getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY); - Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY); + final Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY); if (qss != null) { normalAltQualityScoreSum = (Integer) ((Object[]) qss)[1]; } else { @@ -819,11 +843,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // such as ACTACTACT -> ACTACT, are overwhelmingly false positives so we // hard filter them out by default if (vc.isIndel()) { - ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY); - String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, ""); + final ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY); + final String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, ""); if (rpa != null && rpa.size() > 1 && ru.length() > 1) { - int refCount = (Integer) rpa.get(0); - int altCount = (Integer) rpa.get(1); + final int refCount = (Integer) rpa.get(0); + final int altCount = (Integer) rpa.get(1); if (refCount - altCount == 1) { filters.add(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME); @@ -840,10 +864,10 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // clustered read position 
filter if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){ - Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY); - Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY); - Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY); - Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY); + final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY); + final Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY); + final Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY); + final Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY); //If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) || (tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorRevPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD)) @@ -866,14 +890,15 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i */ protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final double f) { final double[] genotypeLikelihoods = new double[2]; - int AA = 0, AB=1; + final int AA = 0; + final int AB=1; for( final PileupElement p : pileup ) { final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual()); if( p.isDeletion() || qual > minBaseQual ) { // TODO: why not use base qualities here? 
//double pobs = QualityUtils.qualToErrorProbLog10(qual); - double pobs = 1.0d - pow(10, (30 / -10.0)); + final double pobs = 1.0d - pow(10, (30 / -10.0)); if( isNonRef(refBase, p)) { genotypeLikelihoods[AB] += Math.log10(f*pobs + (1-f)*pobs/3.0d); genotypeLikelihoods[AA] += Math.log10((1-pobs)/3); @@ -905,7 +930,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) { - double f = calculateF(pileup, refBase, minBaseQual); + final double f = calculateF(pileup, refBase, minBaseQual); return calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, minBaseQual, f); } @@ -923,11 +948,10 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } } } - double f = (double) altCount / ((double) refCount + (double) altCount); - return f; + return (double) altCount / (refCount + altCount); } - private boolean isNonRef(byte refBase, PileupElement p) { + private boolean isNonRef(final byte refBase, final PileupElement p) { return p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip(); } @@ -960,8 +984,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return readsToRemove; } - private static GATKSAMRecord findReadByName(Collection reads, String name) { - for(GATKSAMRecord read : reads) { + private static GATKSAMRecord findReadByName(final Collection reads, final String name) { + for(final GATKSAMRecord read : reads) { if (name.equals(read.getReadName())) return read; } return null; @@ -1022,7 +1046,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } @Override - public Integer reduce(List callsInRegion, Integer numCalledRegions) { + public Integer reduce(final List callsInRegion, final Integer numCalledRegions) { for( final VariantContext call : callsInRegion ) { vcfWriter.add( call ); } @@ -1030,7 
+1054,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } @Override - public void onTraversalDone(Integer result) { + public void onTraversalDone(final Integer result) { // if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it // referenceConfidenceModel.close(); //TODO remove the need to call close here for debugging, the likelihood output stream should be managed @@ -1045,114 +1069,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i public List> getResourceRodBindings() { return Collections.emptyList(); } public boolean alwaysAppendDbsnpId() { return false; } - /** - * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. - * dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline. - */ - @ArgumentCollection - protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } - - /** - * If a call overlaps with a record from the provided comp track, the INFO field will be annotated - * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). - * Records that are filtered in the comp track will be ignored. - * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). - */ - @Advanced - @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) - public List> comps = Collections.emptyList(); - public List> getCompRodBindings() { return comps; } - - - - /** - * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. 
- */ - @Advanced - @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) -// protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); -// protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBy ruSample", "TandemRepeatAnnotator", -// "RMSMappingQuality","MappingQualityRankSumTest","FisherStrand","StrandOddsRatio","ReadPosRankSumTest","QualByDepth", "Coverage"})); - protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"})); - - /** - * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, - * so annotations will be excluded even if they are explicitly included with the other options. - */ - @Advanced - @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"})); - - /** - * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. - */ - @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - //protected String[] annotationGroupsToUse = { StandardAnnotation.class.getSimpleName() }; - protected String[] annotationClassesToUse = { }; - - /** - * A raw, unfiltered, highly sensitive callset in VCF format. - */ - @Output(doc="File to which variants should be written") - protected VariantContextWriter vcfWriter = null; - - /** - * Active region trimmer reference. 
- */ - @ArgumentCollection - protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer(); - - @Hidden - @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) - protected String keepRG = null; - - - - - /** - * The minimum confidence needed for a given base for it to be used in variant calling. - */ - @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) - public byte MIN_BASE_QUALTY_SCORE = 10; - - - -// PAIR-HMM-Related Goodness - -// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE; -// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.AGGRESSIVE; - public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE; - - // Parameters to control read error correction - @Hidden - @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) - protected boolean errorCorrectReads = false; - - @Hidden - @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) - protected boolean captureAssemblyFailureBAM = false; - - @Advanced - @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) - protected boolean dontUseSoftClippedBases = false; - - @Hidden - @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) - protected boolean justDetermineActiveRegions = false; - - - - - // reference base padding size - private static final int REFERENCE_PADDING = 500; - - private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; - private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument - private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument - - - /** * High-level function that runs the assembler on the active region reads, * returning a data structure with the resulting information needed @@ -1224,14 +1140,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) { clippedRead = ReadClipper.hardClipToRegion(clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop()); if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { - //logger.info("Keeping read " + 
clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); readsToUse.add(clippedRead); } } } - // TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code. - final List downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart); if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) { @@ -1268,9 +1181,9 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i * * @param reads the list of reads to consider */ - private void cleanOverlappingReadPairs(final List reads, Set normalSampleNames) { - Map> data = splitReadsBySample(reads); - for ( String sampleName : data.keySet() ) { + private void cleanOverlappingReadPairs(final List reads, final Set normalSampleNames) { + final Map> data = splitReadsBySample(reads); + for ( final String sampleName : data.keySet() ) { final boolean isTumor = !normalSampleNames.contains(sampleName); final List perSampleReadList = data.get(sampleName); @@ -1284,16 +1197,15 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } } - public static void logReadInfo(String readName, Collection records, String message) { + public static void logReadInfo(final String readName, final Collection records, final String message) { if (readName != null) { - for (GATKSAMRecord rec : records) { + for (final GATKSAMRecord rec : records) { logReadInfo(readName, rec, message); } - } } - public static void logReadInfo(String readName, GATKSAMRecord rec, String message) { + public static void logReadInfo(final String readName, final GATKSAMRecord rec, final String message) { if (readName != null && rec != null && readName.equals(rec.getReadName())) { logger.info("Found " + rec.toString() + " - " + message); } @@ -1322,7 +1234,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return result; } - private boolean 
isReadFromNormal(GATKSAMRecord rec) { + private boolean isReadFromNormal(final GATKSAMRecord rec) { return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample()); } @@ -1338,7 +1250,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i @Advanced @Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false) protected boolean doNotRunPhysicalPhasing = false; - } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java index dbacfbe41..dfbcf9cee 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java @@ -54,9 +54,11 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; import com.google.java.contract.Ensures; import htsjdk.samtools.util.StringUtil; import htsjdk.variant.variantcontext.*; +import org.apache.commons.collections.ListUtils; import org.apache.commons.lang.mutable.MutableDouble; import org.apache.commons.lang.mutable.MutableInt; import org.apache.log4j.Logger; +import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -78,16 +80,33 @@ import java.util.*; public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { - protected final M2ArgumentCollection MTAC; + private final M2ArgumentCollection MTAC; private final TumorPowerCalculator strandArtifactPowerCalculator; - final boolean 
REF_AND_ALT = false; - final boolean ALT_ONLY = true; + + private final String tumorSampleName; + private final String matchedNormalSampleName; + private final String DEBUG_READ_NAME; + + // {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy + private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() { + public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; } + }; private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class); - public SomaticGenotypingEngine(final M2ArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser, final AFCalculatorProvider afCalculatorProvider, final boolean doPhysicalPhasing, final M2ArgumentCollection MTAC) { - super(configuration, samples, genomeLocParser, afCalculatorProvider, doPhysicalPhasing); + public SomaticGenotypingEngine(final M2ArgumentCollection configuration, + final SampleList samples, + final GenomeLocParser genomeLocParser, + final boolean doPhysicalPhasing, + final M2ArgumentCollection MTAC, + final String tumorSampleName, + final String matchedNormalSampleName, + final String DEBUG_READ_NAME) { + super(configuration, samples, genomeLocParser, DUMMY_AF_CALCULATOR_PROVIDER, doPhysicalPhasing); this.MTAC = MTAC; + this.tumorSampleName = tumorSampleName; + this.matchedNormalSampleName = matchedNormalSampleName; + this.DEBUG_READ_NAME = DEBUG_READ_NAME; // coverage related initialization final double errorProbability = Math.pow(10, -(MTAC.POWER_CONSTANT_QSCORE/10)); strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); @@ -105,37 +124,22 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { * @param ref Reference bytes at active region * @param refLoc Corresponding active region genome location * @param activeRegionWindow 
Active window - * @param genomeLocParser GenomeLocParser * @param activeAllelesToGenotype Alleles to genotype - * @param emitReferenceConfidence whether we should add a <NON_REF> alternative allele to the result variation contexts. * * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes * */ -// @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - @Ensures("result != null") // TODO - can this be refactored? this is hard to follow! public CalledHaplotypes callMutations ( - final List haplotypes, - //final Map haplotypeReadMap, - final ReadLikelihoods readLikelihoods, - final Map originalNormalReadQualities, - final Map> perSampleFilteredReadList, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final RefMetaDataTracker tracker, - final List activeAllelesToGenotype, - final boolean emitReferenceConfidence, - final String tumorSampleName, - final String matchedNormalSampleName, - final RodBinding dbsnpRod, - final List> cosmicRod, - final String DEBUG_READ_NAME - - ) { - + final List haplotypes, + final ReadLikelihoods readLikelihoods, + final Map originalNormalReadQualities, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final RefMetaDataTracker tracker, + final List activeAllelesToGenotype) { // sanity check input arguments if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods); @@ -143,8 +147,6 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { if (refLoc == null || refLoc.size() != ref.length) throw new 
IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); - if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); - // Somatic Tumor/Normal Sample Handling verifySamplePresence(tumorSampleName, readLikelihoods.samples()); @@ -159,293 +161,217 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final List returnCalls = new ArrayList<>(); for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region - final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); - - if( eventsAtThisLoc.isEmpty() ) { continue; } - - // Create the event mapping object which maps the original haplotype events to the events present at just this locus - final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); - - // Sanity check the priority list for mistakes - final List priorityList = makePriorityList(eventsAtThisLoc); - - // Merge the event to find a common reference representation - - VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); - - if( mergedVC == null ) { continue; } - - if (emitReferenceConfidence) - mergedVC = addNonRefSymbolicAllele(mergedVC); - - final Map mergeMap = new LinkedHashMap<>(); - mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele - 
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) { - mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function - } - - final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); - - if( configuration.DEBUG && logger != null ) { - if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - } - - ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); - - //LDG: do we want to do this before or after pulling out overlapping reads? - if (MTAC.isSampleContaminationPresent()) - readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination()); - - //if (!mergedVC.isBiallelic()) { - // logger.info("[UNSUPPORTED] Detected non-Biallelic VC" + mergedVC.toString()); - // continue; - //} - - // TODO: once tests are passing, refactor to use the new data structure (not the deprecated one) - // handle overlapping fragments - // TODO: CONFIRM WITH GSA IF IT IS OK TO REMOVE READS FROM THE PRALM (should be... they do it in filterPoorlyModeledReads!) - PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName)); - filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false); - MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads"); - // extend to multiple samples - - // compute tumor LOD for each alternate allele - // TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. 
getHetGenotypeLogLikelihoods will not check that for you - final PerAlleleCollection altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false); - final PerAlleleCollection tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions); - - if( configuration.DEBUG && logger != null ) { - StringBuilder outputSB = new StringBuilder("Calculated allelic fraction at " + loc + " = ["); - for (Allele allele : altAlleleFractions.getAltAlleles()){ - outputSB.append( allele + ": " + altAlleleFractions.getAlt(allele) + ", "); - } - outputSB.append("]"); - logger.info(outputSB.toString()); - } - - final PerAlleleCollection tumorLods = PerAlleleCollection.createPerAltAlleleCollection(); - for (Allele altAllele : mergedVC.getAlternateAlleles()){ - tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef()); - } - - if (configuration.DEBUG && logger != null) { - StringBuilder outputSB = new StringBuilder("Tumor LOD at " + loc + " = ["); - for (Allele altAllele : tumorLods.getAltAlleles()) { - outputSB.append( altAllele + ": " + tumorLods.getAlt(altAllele) + ", "); - } - outputSB.append("]"); - logger.info(outputSB.toString()); - } - - double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; - double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; - PerReadAlleleLikelihoodMap normalPRALM = null; - PerAlleleCollection normalLods = PerAlleleCollection.createPerAltAlleleCollection(); - - // if normal bam is available, compute normal LOD - if (hasNormal) { - normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); - filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true); - MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads"); - - GenomeLoc eventGenomeLoc = 
genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); - Collection cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc); - Collection dbsnpVC = tracker.getValues(dbsnpRod, eventGenomeLoc); - // remove the effect of cosmic from dbSNP - final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty()); - - INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal - NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD; - - - // compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5 - // note normal LOD is REF over ALT, the reciprocal of the tumor LOD - final PerAlleleCollection diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection(); - for (final Allele allele : mergedVC.getAlternateAlleles()){ - diploidHetAlleleFractions.setAlt(allele, 0.5); - } - - final PerAlleleCollection normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions); - - for (final Allele altAllele : mergedVC.getAlternateAlleles()){ - normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele)); - } - - } - - int numPassingAlts = 0; - Set allelesThatPassThreshold = new HashSet<>(); - Allele alleleWithHighestTumorLOD = null; - - // TODO: use lambda - for (Allele altAllele : mergedVC.getAlternateAlleles()) { - final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD; - final boolean passesNormalLodThreshold = hasNormal ? 
normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true; - if (passesTumorLodThreshold && passesNormalLodThreshold) { - numPassingAlts++; - allelesThatPassThreshold.add(altAllele); - if (alleleWithHighestTumorLOD == null - || tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){ - alleleWithHighestTumorLOD = altAllele; - } - } - } - - final boolean emitVariant = numPassingAlts > 0; - - VariantContext call = null; - if (emitVariant) { - VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC); - // FIXME: can simply get first alternate since above we only deal with Bi-allelic sites... - int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size(); - callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); - callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD)); - - if (hasNormal) { - callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD)); - if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) { - callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME); - } - } - - // M1-style strand artifact filter - // TODO: move code to MuTect2::calculateFilters() - // skip if VC has multiple alleles - it will get filtered later anyway - if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) { - final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap(); - final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap(); - splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM); - - MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split"); - MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split"); - MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in 
reverse PRALM after PRALM is split"); - - // TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[] - // TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd. - final PerAlleleCollection alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true); - final PerAlleleCollection tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward); - - final PerAlleleCollection alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true); - final PerAlleleCollection tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse); - - if( configuration.DEBUG && logger != null ) { - StringBuilder forwardMessage = new StringBuilder("Calculated forward allelic fraction at " + loc + " = ["); - StringBuilder reverseMessage = new StringBuilder("Calculated reverse allelic fraction at " + loc + " = ["); - - for (Allele altAllele : altAlleleFractions.getAltAlleles()){ - forwardMessage.append( altAllele + ": " + alleleFractionsForward.getAlt(altAllele) + ", "); - reverseMessage.append( altAllele + ": " + alleleFractionsReverse.getAlt(altAllele) + ", "); - } - - forwardMessage.append("]"); - reverseMessage.append("]"); - - logger.info(forwardMessage.toString()); - logger.info(reverseMessage.toString()); - } - - double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef(); - double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef(); - - double tumorSBpower_fwd = 0.0; - double tumorSBpower_rev = 0.0; - try { - // Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction - tumorSBpower_fwd = 
strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); - tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); - } - catch (Throwable t) { - System.err.println("Error processing " + activeRegionWindow.getContig() + ":" + loc); - t.printStackTrace(System.err); - throw new RuntimeException(t); - } - - callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd); - callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev); - callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd); - callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev); - // TODO: add vcf INFO fields. see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); - - if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || - (tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD)) - callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME); - } - - // TODO: this probably belongs in M2::calculateFilters() - if (numPassingAlts > 1) { - callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME); - } - - // build genotypes TODO: this part needs review and refactor - List tumorAlleles = new ArrayList<>(); - tumorAlleles.add(mergedVC.getReference()); - tumorAlleles.add(alleleWithHighestTumorLOD); - Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles) - .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD)) - .make(); // TODO: add ADs? - List genotypes = new ArrayList<>(); - genotypes.add(tumorGenotype); - - // We assume that the genotype in the normal is 0/0 - // TODO: is normal always homozygous reference? 
- List homRefAllelesforNormalGenotype = new ArrayList<>(); - homRefAllelesforNormalGenotype.addAll(Collections.nCopies(2, mergedVC.getReference())); - - // if we are calling with a normal, build the genotype for the sample to appear in vcf - int REF = 0, ALT = 1; - if (hasNormal) { - PerAlleleCollection normalCounts = getRefAltCount(mergedVC, normalPRALM, false); - final int normalRefAlleleDepth = normalCounts.getRef(); - final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD); - final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth }; - final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth); - - final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype) - .AD(normalAlleleDepths) - .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction) - .make(); - genotypes.add(normalGenotype); - } - - //only use alleles found in the tumor ( - call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make(); - - } - - // how should we be making use of _perSampleFilteredReadList_? 
- if( call != null ) { - readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList, - genomeLocParser, emitReferenceConfidence, alleleMapper, readAlleleLikelihoods, call); - - ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref); - VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false); - - if( call.getAlleles().size() != mergedVC.getAlleles().size() ) - annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); - - // maintain the set of all called haplotypes - for ( final Allele calledAllele : call.getAlleles() ) { - final List haplotypeList = alleleMapper.get(calledAllele); - if (haplotypeList == null) continue; - calledHaplotypes.addAll(haplotypeList); - } - - returnCalls.add( annotatedCall ); - } - + if( loc < activeRegionWindow.getStart() || loc > activeRegionWindow.getStop() ) { + continue; } + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); + + if( eventsAtThisLoc.isEmpty() ) { continue; } + + // Create the event mapping object which maps the original haplotype events to the events present at just this locus + final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); + + final List priorityList = makePriorityList(eventsAtThisLoc); + + VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + + if( mergedVC == null ) { continue; } + + final Map mergeMap = new LinkedHashMap<>(); + mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele + for(int iii = 0; iii < 
eventsAtThisLoc.size(); iii++) { + mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function + } + + final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); + + ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); + + //LDG: do we want to do this before or after pulling out overlapping reads? + if (MTAC.isSampleContaminationPresent()) + readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination()); + + final PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName)); + filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false); + MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads"); + // extend to multiple samples + + // compute tumor LOD for each alternate allele + // TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. 
getHetGenotypeLogLikelihoods will not check that for you + final PerAlleleCollection altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false); + final PerAlleleCollection tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions); + + final PerAlleleCollection tumorLods = PerAlleleCollection.createPerAltAlleleCollection(); + for (final Allele altAllele : mergedVC.getAlternateAlleles()){ + tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef()); + } + + double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; + double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; + PerReadAlleleLikelihoodMap normalPRALM = null; + final PerAlleleCollection normalLods = PerAlleleCollection.createPerAltAlleleCollection(); + + // if normal bam is available, compute normal LOD + if (hasNormal) { + normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); + filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true); + MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads"); + + final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); + final Collection cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc); + final Collection dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc); + // remove the effect of cosmic from dbSNP + final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty()); + + INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal + NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD; + + // compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5 + // note normal LOD is REF over ALT, the reciprocal of 
the tumor LOD + final PerAlleleCollection diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection(); + for (final Allele allele : mergedVC.getAlternateAlleles()){ + diploidHetAlleleFractions.setAlt(allele, 0.5); + } + + final PerAlleleCollection normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions); + + for (final Allele altAllele : mergedVC.getAlternateAlleles()){ + normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele)); + } + } + + int numPassingAlts = 0; + final Set allelesThatPassThreshold = new HashSet<>(); + Allele alleleWithHighestTumorLOD = null; + + // TODO: use lambda + for (final Allele altAllele : mergedVC.getAlternateAlleles()) { + final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD; + final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true; + if (passesTumorLodThreshold && passesNormalLodThreshold) { + numPassingAlts++; + allelesThatPassThreshold.add(altAllele); + if (alleleWithHighestTumorLOD == null + || tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){ + alleleWithHighestTumorLOD = altAllele; + } + } + } + if (numPassingAlts == 0) { + continue; + } + + + final VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC); + // FIXME: can simply get first alternate since above we only deal with Bi-allelic sites... 
+ final int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size(); + callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); + callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD)); + + if (hasNormal) { + callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD)); + if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) { + callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME); + } + } + + // M1-style strand artifact filter + // TODO: move code to MuTect2::calculateFilters() + // skip if VC has multiple alleles - it will get filtered later anyway + if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) { + final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap(); + final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap(); + splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM); + + MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split"); + MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split"); + MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split"); + + // TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[] + // TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd. 
+ final PerAlleleCollection alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true); + final PerAlleleCollection tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward); + + final PerAlleleCollection alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true); + final PerAlleleCollection tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse); + + final double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef(); + final double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef(); + + // Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction + final double tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); + final double tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); + + + callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd); + callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev); + callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd); + callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev); + // TODO: add vcf INFO fields. 
see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); + + if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || + (tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD)) + callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME); + } + + // TODO: this probably belongs in M2::calculateFilters() + if (numPassingAlts > 1) { + callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME); + } + + // build genotypes TODO: this part needs review and refactor + final List tumorAlleles = Arrays.asList(mergedVC.getReference(), alleleWithHighestTumorLOD); + final Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles) + .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD)) + .make(); // TODO: add ADs? + final List genotypes = new ArrayList<>(); + genotypes.add(tumorGenotype); + + // We assume that the genotype in the normal is 0/0 + // TODO: is normal always homozygous reference? 
+ final List homRefAllelesforNormalGenotype = Collections.nCopies(2, mergedVC.getReference()); + + // if we are calling with a normal, build the genotype for the sample to appear in vcf + if (hasNormal) { + final PerAlleleCollection normalCounts = getRefAltCount(mergedVC, normalPRALM, false); + final int normalRefAlleleDepth = normalCounts.getRef(); + final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD); + final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth }; + final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth); + + final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype) + .AD(normalAlleleDepths) + .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction) + .make(); + genotypes.add(normalGenotype); + } + + //only use alleles found in the tumor ( + final VariantContext call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make(); + + // how should we be making use of _perSampleFilteredReadList_? 
+ readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList, + genomeLocParser, false, alleleMapper, readAlleleLikelihoods, call); + + final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref); + VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false); + + if( call.getAlleles().size() != mergedVC.getAlleles().size() ) + annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); + + // maintain the set of all called haplotypes + call.getAlleles().stream().map(alleleMapper::get).filter(Objects::nonNull).forEach(calledHaplotypes::addAll); + returnCalls.add( annotatedCall ); } // TODO: understand effect of enabling this for somatic calling... - final List phasedCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls; - return new CalledHaplotypes(phasedCalls, calledHaplotypes); - //return new CalledHaplotypes(returnCalls, calledHaplotypes); + final List outputCalls = doPhysicalPhasing ? 
phaseCalls(returnCalls, calledHaplotypes) : returnCalls; + return new CalledHaplotypes(outputCalls, calledHaplotypes); } - private void verifySamplePresence(String sampleName, List samples) { + private void verifySamplePresence(final String sampleName, final List samples) { if (!samples.contains(sampleName)) { throw new IllegalArgumentException("Unable to find sample name "+sampleName+"in sample list of " + StringUtil.join(",", samples)); } @@ -459,7 +385,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { * @param alleleFractions allele fraction(s) for alternate allele(s) * * @return genotype likelihoods for homRef and het for each alternate allele - */ + */ private PerAlleleCollection getHetGenotypeLogLikelihoods(final VariantContext mergedVC, final PerReadAlleleLikelihoodMap tumorPRALM, final Map originalNormalMQs, @@ -470,13 +396,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } final PerAlleleCollection genotypeLogLikelihoods = PerAlleleCollection.createPerRefAndAltAlleleCollection(); - for (final Allele allele : mergedVC.getAlleles()){ - genotypeLogLikelihoods.set(allele, new MutableDouble(0.0)); - } + mergedVC.getAlleles().forEach(a -> genotypeLogLikelihoods.set(a, new MutableDouble(0))); final Allele refAllele = mergedVC.getReference(); for(Map.Entry> readAlleleLikelihoodMap : tumorPRALM.getLikelihoodReadMap().entrySet()) { - Map alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); + final Map alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); if (originalNormalMQs.get(readAlleleLikelihoodMap.getKey().getReadName()) == 0) { continue; } @@ -484,9 +408,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final double readRefLogLikelihood = alleleLikelihoodMap.get(refAllele); genotypeLogLikelihoods.getRef().add(readRefLogLikelihood); - for (Allele altAllele : alleleFractions.getAltAlleles()) { - double readAltLogLikelihood = 
alleleLikelihoodMap.get(altAllele); - double adjustedReadAltLL = Math.log10( + for (final Allele altAllele : alleleFractions.getAltAlleles()) { + final double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele); + final double adjustedReadAltLL = Math.log10( Math.pow(10, readRefLogLikelihood) * (1 - alleleFractions.getAlt(altAllele)) + Math.pow(10, readAltLogLikelihood) * alleleFractions.getAlt(altAllele) ); @@ -515,9 +439,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final PerAlleleCollection alleleCounts = getRefAltCount(vc, pralm, oneStrandOnly); final PerAlleleCollection alleleFractions = PerAlleleCollection.createPerAltAlleleCollection(); - int refCount = alleleCounts.getRef(); + final int refCount = alleleCounts.getRef(); for ( final Allele altAllele : vc.getAlternateAlleles() ) { - int altCount = alleleCounts.getAlt(altAllele); + final int altCount = alleleCounts.getAlt(altAllele); double alleleFraction = (double) altCount / (refCount + altCount); // weird case, but I've seen it happen in one strand cases if (refCount == 0 && altCount == refCount ) { @@ -552,16 +476,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final PerAlleleCollection alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection(); - - // initialize the allele counts to 0 - for (final Allele allele : vcAlleles) { - alleleCounts.set(allele, new MutableInt(0)); - } + vcAlleles.stream().forEach(a -> alleleCounts.set(a, new MutableInt(0))); for (final Map.Entry> readAlleleLikelihoodMap : pralm.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = readAlleleLikelihoodMap.getKey(); final Map alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); - MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles); + final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles); if 
(read.getMappingQuality() > 0 && mostLikelyAllele.isInformative()) { alleleCounts.get(mostLikelyAllele.getMostLikelyAllele()).increment(); @@ -581,18 +501,16 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } } - private void filterPRALMForOverlappingReads(PerReadAlleleLikelihoodMap pralm, Allele ref, int location, boolean retainMismatches) { - - Map> m = pralm.getLikelihoodReadMap(); - + private void filterPRALMForOverlappingReads(final PerReadAlleleLikelihoodMap pralm, final Allele ref, final int location, final boolean retainMismatches) { + final Map> m = pralm.getLikelihoodReadMap(); // iterate through the reads, if the name has been seen before we have overlapping (potentially) fragments, so handle them - Map nameToRead = new HashMap<>(); - Set readsToKeep = new HashSet<>(); + final Map nameToRead = new HashMap<>(); + final Set readsToKeep = new HashSet<>(); - for(GATKSAMRecord rec : m.keySet()) { + for(final GATKSAMRecord rec : m.keySet()) { // if we haven't seen it... just record the name and add it to the list of reads to keep - GATKSAMRecord existing = nameToRead.get(rec.getReadName()); + final GATKSAMRecord existing = nameToRead.get(rec.getReadName()); if (existing == null) { nameToRead.put(rec.getReadName(), rec); readsToKeep.add(rec); @@ -604,11 +522,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { // TODO: CHECK IF THE READS BOTH OVERLAP THE POSITION!!!! 
if ( ReadUtils.isInsideRead(existing, location) && ReadUtils.isInsideRead(rec, location) ) { - MostLikelyAllele existingMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing)); - Allele existingAllele = existingMLA.getMostLikelyAllele(); + final MostLikelyAllele existingMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing)); + final Allele existingAllele = existingMLA.getMostLikelyAllele(); - MostLikelyAllele recMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec)); - Allele recAllele = recMLA.getMostLikelyAllele(); + final MostLikelyAllele recMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec)); + final Allele recAllele = recMLA.getMostLikelyAllele(); // if the reads disagree at this position... if (!existingAllele.equals(recAllele)) { @@ -654,7 +572,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap originalPRALM, final PerReadAlleleLikelihoodMap forwardPRALM, final PerReadAlleleLikelihoodMap reversePRALM) { - Map> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap(); + final Map> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap(); for (final GATKSAMRecord read : origReadAlleleLikelihoodMap.keySet()) { if (read.isStrandless()) continue; @@ -669,17 +587,4 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { } } } - - - // Move to utility class so we can use one shared with HaplotypeCallerGenotypingEngine - private VariantContext addNonRefSymbolicAllele(final VariantContext mergedVC) { - final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC); - final List originalList = mergedVC.getAlleles(); - final List alleleList = new ArrayList<>(originalList.size() + 1); - alleleList.addAll(mergedVC.getAlleles()); - alleleList.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); 
- vcb.alleles(alleleList); - return vcb.make(); - } - } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java index 5dd37dc7c..8c09160ff 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/TumorPowerCalculator.java @@ -55,6 +55,7 @@ import org.apache.commons.math.MathException; import org.apache.commons.math.distribution.BinomialDistribution; import org.apache.commons.math.distribution.BinomialDistributionImpl; import org.apache.commons.math3.util.Pair; +import org.broadinstitute.gatk.utils.exceptions.GATKException; import java.util.Arrays; import java.util.HashMap; @@ -70,7 +71,6 @@ public class TumorPowerCalculator { private final double tumorLODThreshold; private final double contamination; private final boolean enableSmoothing; - public static int numCacheHits = 0; private final HashMap cache = new HashMap(); @@ -134,16 +134,18 @@ public class TumorPowerCalculator { * @throws MathException * */ - public double cachedPowerCalculation(final int numReads, final double alleleFraction) throws MathException { + public double cachedPowerCalculation(final int numReads, final double alleleFraction) { PowerCacheKey key = new PowerCacheKey(numReads, alleleFraction); // we first look up if power for given number of read and allele fraction has already been computed and stored in the cache. // if not we compute it and store it in teh cache. 
Double power = cache.get(key); if (power == null) { - power = calculatePower(numReads, alleleFraction); + try { + power = calculatePower(numReads, alleleFraction); + } catch (final Exception ex) { + throw new GATKException("Power calculation failed", ex); + } cache.put(key, power); - } else { - numCacheHits++; } return power; } From cd5c04f8066513bb5d1bfc79163961d82debcc07 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 19 Aug 2016 14:29:33 -0400 Subject: [PATCH 33/68] Make getElementForRead() in RankSumTest robust --- .../gatk/tools/walkers/annotator/AS_RankSumTest.java | 2 +- .../tools/walkers/annotator/AS_ReadPosRankSumTest.java | 7 +++++-- .../gatk/tools/walkers/annotator/RankSumTest.java | 4 ++-- .../gatk/tools/walkers/annotator/ReadPosRankSumTest.java | 2 +- ...ypeCallerComplexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerGVCFIntegrationTest.java | 2 +- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java index f413f55cc..05a696a32 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java @@ -278,7 +278,7 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn if ( isUsableRead(read, refLoc) ) { final Double value = getElementForRead(read, refLoc, a); // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion - if ( value == null || value < 0.0 ) + if ( value == null || value == INVALID_ELEMENT_FROM_READ ) continue; if(perAlleleValues.containsKey(a.getMostLikelyAllele())) diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java index e0a92af1d..0028fe9f8 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java @@ -102,10 +102,13 @@ public class AS_ReadPosRankSumTest extends AS_RankSumTest implements AS_Standard if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) return null; + // If the offset inside a deletion, it does not lie on a read. + if ( AlignmentUtils.isInsideDeletion(read.getCigar(), offset) ) { + return INVALID_ELEMENT_FROM_READ; + } + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0); final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); - // Note: For a spanning deletion, readPos is at the upstream end of the deletion and is greater than numAlignedBases (which does not include deletions). - // Hence, the resulting readPos will have a negative value. 
if (readPos > numAlignedBases / 2) readPos = numAlignedBases - (readPos + 1); return (double)readPos; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java index b1f7bb63e..c9eaba76a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java @@ -76,7 +76,7 @@ import java.util.*; //TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { static final boolean DEBUG = false; - protected static double INVALID_READ_POSITION = -1; // No mapping to a read position + protected static double INVALID_ELEMENT_FROM_READ = Double.NEGATIVE_INFINITY; public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -185,7 +185,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR if ( isUsableRead(read, refLoc) ) { final Double value = getElementForRead(read, refLoc, a); // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion - if ( value == null || value == INVALID_READ_POSITION ) + if ( value == null || value == INVALID_ELEMENT_FROM_READ ) continue; if ( a.getMostLikelyAllele().isReference() ) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java index fbba479e4..62256c207 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java @@ -106,7 +106,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio // If the offset inside a deletion, it does not lie on a read. if ( AlignmentUtils.isInsideDeletion(read.getCigar(), offset) ) { - return INVALID_READ_POSITION; + return INVALID_ELEMENT_FROM_READ; } int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 ); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index af1987adf..d5c1eba67 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "86528820f8c102c712d9562b83204c05"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "b01df95864808dc67295efc6db37983d"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index bb7cd0a12..739371f39 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -458,7 +458,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testHaplotypeCallerGVCSpanDel() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:26357667 -ERC GVCF --no_cmdline_in_header -A AS_ReadPosRankSumTest -A ReadPosRankSumTest -variant_index_type %s -variant_index_parameter %d", b37KGReference, privateTestDir + "NexPond-377866-1:26357600-26357700.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("93bc22340e6a4b01a7b96e5a3a12dfc3")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("b8f0bb74bc099a8f78d600d88861e1b6")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerGVCSpanDel", spec); } From bc3b3ac0ec4b4fd72a9e856470edaeb4c7566a06 Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Thu, 25 Aug 2016 13:20:13 -0400 Subject: [PATCH 34/68] Cleaned up SomaticGenotypingEngine::callMutations and added some TODOs. 
--- .../cancer/m2/SomaticGenotypingEngine.java | 106 +++++++++++------- .../cancer/m2/MuTect2IntegrationTest.java | 6 +- 2 files changed, 70 insertions(+), 42 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java index dfbcf9cee..20940c74f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java @@ -107,8 +107,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { this.tumorSampleName = tumorSampleName; this.matchedNormalSampleName = matchedNormalSampleName; this.DEBUG_READ_NAME = DEBUG_READ_NAME; + // coverage related initialization - final double errorProbability = Math.pow(10, -(MTAC.POWER_CONSTANT_QSCORE/10)); + final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE / 10); strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); } @@ -148,8 +149,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); - // Somatic Tumor/Normal Sample Handling - verifySamplePresence(tumorSampleName, readLikelihoods.samples()); + if (!readLikelihoods.samples().contains(tumorSampleName)) { + throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample "+ tumorSampleName); + } + + // if we don't have the normal sample, we are in 
tumor only mode + // TODO: check in MuTect2.java for code we can skip when in tumor only mode final boolean hasNormal = matchedNormalSampleName != null; // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference @@ -164,6 +169,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { if( loc < activeRegionWindow.getStart() || loc > activeRegionWindow.getStop() ) { continue; } + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); if( eventsAtThisLoc.isEmpty() ) { continue; } @@ -171,28 +177,46 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { // Create the event mapping object which maps the original haplotype events to the events present at just this locus final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); + // TODO: priorityList is not sorted by priority, might as well just use eventsAtThisLoc.map(VariantContext::getSource) final List priorityList = makePriorityList(eventsAtThisLoc); + // merge variant contexts from multiple haplotypes into one variant context + // TODO: we should use haplotypes if possible, but that may have to wait for GATK4 VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } + // TODO: this varaible needs a descriptive name final Map mergeMap = new LinkedHashMap<>(); + mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele - for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) { - mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function + for(int i = 0; i < 
eventsAtThisLoc.size(); i++) { + // TODO: as noted below, this operation seems dangerous. Understand how things can go wrong. + mergeMap.put(eventsAtThisLoc.get(i), mergedVC.getAlternateAllele(i)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function } + /** TODO: the code in the for loop up to here needs refactor. The goal, as far as I can tell, is to create two things: alleleMapper and mergedVC + * alleleMapper maps alleles to haplotypes, and we need this to create readAlleleLikelihoods. + * To make alleleMapper we make mergeMap (of type VC -> Allele) and eventMapper (of type Event -> List(Haplotypes), where Event is essentialy Variant Context) + * If we just want a map of Alleles to Haplotypes, we should be able to do so directly; no need for intermediate maps, which just complicates the code. + **/ + + final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); + // converting ReadLikelihoods to ReadLikeliHoods ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); - //LDG: do we want to do this before or after pulling out overlapping reads? - if (MTAC.isSampleContaminationPresent()) + // LDG: do we want to do this before or after pulling out overlapping reads? + // TODO: do we want this at all? How does downsampling help? 
+ if (MTAC.isSampleContaminationPresent()) { readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination()); + } + // TODO: this is a good break point for a new method + // TODO: replace PRALM with ReadLikelihoods final PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName)); filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false); MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads"); @@ -208,11 +232,23 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef()); } - double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; - double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE; + + // TODO: another good breakpoint e.g. compute normal LOD/set thresholds + // TODO: anything related to normal should be encapsulated in Optional + + // Normal LOD must exceed this threshold for the variant to make it in the vcf + // TODO: variable name too log + double normalLodThresholdForVCF = -Double.MIN_VALUE; + + // A variant candidate whose normal LOD is below this threshold will be filtered as 'germline_risk' + // This is a more stringent threshold than normalLodThresholdForVCF + double normalLodFilterThreshold = -Double.MIN_VALUE; + PerReadAlleleLikelihoodMap normalPRALM = null; final PerAlleleCollection normalLods = PerAlleleCollection.createPerAltAlleleCollection(); + // TODO: this if statement should be a standalone method for computing normal LOD + // TODO: then we can do something like normalLodThreshold = hasNormal ? 
thisMethod() : Optional.empty() // if normal bam is available, compute normal LOD if (hasNormal) { normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); @@ -222,13 +258,13 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); final Collection cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc); final Collection dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc); - // remove the effect of cosmic from dbSNP - final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty()); - INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal - NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD; + final boolean germlineAtRisk = !dbsnpVC.isEmpty() && cosmicVC.isEmpty(); - // compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5 + normalLodThresholdForVCF = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; + normalLodFilterThreshold = germlineAtRisk ? 
MTAC.NORMAL_DBSNP_LOD_THRESHOLD : MTAC.NORMAL_LOD_THRESHOLD; + + // compute normal LOD = LL(X|REF)/LL(X|ALT) where REF is the diploid HET with AF = 0.5 // note normal LOD is REF over ALT, the reciprocal of the tumor LOD final PerAlleleCollection diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection(); for (final Allele allele : mergedVC.getAlternateAlleles()){ @@ -246,40 +282,36 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final Set allelesThatPassThreshold = new HashSet<>(); Allele alleleWithHighestTumorLOD = null; - // TODO: use lambda for (final Allele altAllele : mergedVC.getAlternateAlleles()) { final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD; - final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true; + final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= normalLodThresholdForVCF : true; if (passesTumorLodThreshold && passesNormalLodThreshold) { numPassingAlts++; allelesThatPassThreshold.add(altAllele); - if (alleleWithHighestTumorLOD == null - || tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){ + if (alleleWithHighestTumorLOD == null || tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){ alleleWithHighestTumorLOD = altAllele; } } } + if (numPassingAlts == 0) { continue; } - final VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC); - // FIXME: can simply get first alternate since above we only deal with Bi-allelic sites... 
- final int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size(); + final int haplotypeCount = alleleMapper.get(alleleWithHighestTumorLOD).size(); callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD)); if (hasNormal) { callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD)); - if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) { + if (normalLods.getAlt(alleleWithHighestTumorLOD) < normalLodFilterThreshold) { callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME); } } - // M1-style strand artifact filter + // TODO: this should be a separate method // TODO: move code to MuTect2::calculateFilters() - // skip if VC has multiple alleles - it will get filtered later anyway if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) { final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap(); final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap(); @@ -304,12 +336,10 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final double tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); final double tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD)); - callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd); callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev); callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd); callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev); - // TODO: add vcf INFO fields. 
see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount); if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) || (tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD)) @@ -323,9 +353,15 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { // build genotypes TODO: this part needs review and refactor final List tumorAlleles = Arrays.asList(mergedVC.getReference(), alleleWithHighestTumorLOD); + // TODO: estimateAlleleFraction should not repeat counting allele depths + final PerAlleleCollection tumorAlleleDepths = getRefAltCount(mergedVC, tumorPRALM, false); + final int tumorRefAlleleDepth = tumorAlleleDepths.getRef(); + final int tumorAltAlleleDepth = tumorAlleleDepths.getAlt(alleleWithHighestTumorLOD); final Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles) + .AD(new int[] { tumorRefAlleleDepth, tumorAltAlleleDepth }) .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD)) - .make(); // TODO: add ADs? 
+ .make(); + final List genotypes = new ArrayList<>(); genotypes.add(tumorGenotype); @@ -335,20 +371,18 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { // if we are calling with a normal, build the genotype for the sample to appear in vcf if (hasNormal) { - final PerAlleleCollection normalCounts = getRefAltCount(mergedVC, normalPRALM, false); - final int normalRefAlleleDepth = normalCounts.getRef(); - final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD); - final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth }; + final PerAlleleCollection normalAlleleDepths = getRefAltCount(mergedVC, normalPRALM, false); + final int normalRefAlleleDepth = normalAlleleDepths.getRef(); + final int normalAltAlleleDepth = normalAlleleDepths.getAlt(alleleWithHighestTumorLOD); final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth); final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype) - .AD(normalAlleleDepths) + .AD(new int[] { normalRefAlleleDepth, normalAltAlleleDepth }) .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction) .make(); genotypes.add(normalGenotype); } - //only use alleles found in the tumor ( final VariantContext call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make(); // how should we be making use of _perSampleFilteredReadList_? 
@@ -371,12 +405,6 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { return new CalledHaplotypes(outputCalls, calledHaplotypes); } - private void verifySamplePresence(final String sampleName, final List samples) { - if (!samples.contains(sampleName)) { - throw new IllegalArgumentException("Unable to find sample name "+sampleName+"in sample list of " + StringUtil.join(",", samples)); - } - } - /** Calculate the likelihoods of hom ref and each het genotype of the form ref/alt * * @param mergedVC input VC diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java index 0b93b9fc1..b7ca42c4f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java @@ -149,7 +149,7 @@ public class MuTect2IntegrationTest extends WalkerTest { */ @Test public void testFalsePositivesDream3() { - M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "", "c23f794866797f9bbcb3ed04451758be"); // e2413f4166b6ed20be6cdee6616ba43d + M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "", "c9eec57bbc93ea630c202b7620f8dca8"); // e2413f4166b6ed20be6cdee6616ba43d } /** @@ -170,12 +170,12 @@ public class MuTect2IntegrationTest extends WalkerTest { @Test public void testStrandArtifactFilter(){ - M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_strand_artifact_filter", "75c9349ff9f8dc84291396ac50871f64"); + M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_strand_artifact_filter", "1686c1a0e63768497f21b9d7bb6548c5"); } @Test public void testClusteredReadPositionFilter() { - 
M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "c333f7dc11e39e0713147ad9af2bf4db"); + M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "b44c23af7de84f96d2371db25d29aba2"); } From a88390510155da798e58ebb61c7e2d7d1bc59ee7 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Wed, 17 Aug 2016 10:31:59 -0400 Subject: [PATCH 35/68] Remove -stand_emit_conf argument --- .../GenotypeCalculationArgumentCollection.java | 6 ------ .../tools/walkers/genotyper/GenotypingEngine.java | 12 ++++++------ .../tools/walkers/genotyper/UnifiedGenotyper.java | 1 - .../walkers/genotyper/UnifiedGenotypingEngine.java | 14 +++++++------- .../walkers/haplotypecaller/HaplotypeCaller.java | 4 ---- .../genotyper/UnifiedGenotyperIntegrationTest.java | 2 +- ...ifiedGenotyperNormalCallingIntegrationTest.java | 4 ++-- .../HaplotypeCallerIntegrationTest.java | 6 +++--- .../variantutils/GenotypeGVCFsIntegrationTest.java | 14 +++++++------- 9 files changed, 26 insertions(+), 37 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index 4a5456d94..d879eec82 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -110,12 +110,6 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false) public double 
STANDARD_CONFIDENCE_FOR_CALLING = 30.0; - /** - * This argument allows you to emit low quality calls as filtered records. - */ - @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false) - public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; - /** * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES), * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index cdcb2c195..a9ec24c4f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -68,6 +68,7 @@ import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; @@ -218,7 +219,7 @@ public abstract class GenotypingEngine stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final boolean 
inheritAttributesFromInputVC, - final Map perReadAlleleLikelihoodMap, + final Map perReadAlleleLikelihoodMap, final boolean doAlleleSpecificCalcs) { final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; @@ -362,7 +363,7 @@ public abstract class GenotypingEngine= Math.min(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING, - configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING); + conf >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING; } protected final boolean passesCallThreshold(double conf) { @@ -682,7 +682,7 @@ public abstract class GenotypingEngine composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc, final AlignmentContext rawContext, final Map stratifiedContexts, final RefMetaDataTracker tracker, final ReferenceContext refContext, final List alleleCountsofMLE, final boolean bestGuessIsRef, final AFCalculationResult AFresult, final List allAllelesToUse, final GenotypesContext genotypes, - final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap, + final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap, final boolean doAlleleSpecificCalcs) { final HashMap attributes = new HashMap<>(); @@ -778,7 +778,7 @@ public abstract class GenotypingEngine= QualityUtils.qualToErrorProbLog10(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING)) + if (normalizedLog10ACeq0Posterior >= QualityUtils.qualToErrorProbLog10(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING/3)) return 0.0; return 1.0 - Math.pow(10.0, normalizedLog10ACeq0Posterior); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java index 188ca1e49..1f6eb8b8d 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java @@ -116,7 +116,6 @@ import java.util.*; * --dbsnp dbSNP.vcf \ * -o snps.raw.vcf \ * -stand_call_conf [50.0] \ - * -stand_emit_conf 10.0 \ * [-L targets.interval_list] * * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java index 5c3c68c43..d1774160e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java @@ -203,7 +203,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine models = getGLModelsToUse(tracker, rawContext); - final Map perReadAlleleLikelihoodMap = new HashMap<>(); + final Map perReadAlleleLikelihoodMap = new HashMap<>(); final VariantCallContext defaultResult = configuration.outputMode == OutputMode.EMIT_ALL_SITES && configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES @@ -267,7 +267,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine perReadAlleleLikelihoodMap) { + final Map perReadAlleleLikelihoodMap) { final List models = getGLModelsToUse(tracker, rawContext); if ( models.isEmpty() ) { return null; @@ -345,7 +345,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine alternateAllelesToUse, final boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model, - final Map perReadAlleleLikelihoodMap) { + final Map perReadAlleleLikelihoodMap) { return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, 
stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser != null || refContext == null ? genomeLocParser : refContext.getGenomeLocParser(), perReadAlleleLikelihoodMap); @@ -360,7 +360,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final boolean inheritAttributesFromInputVC, - final Map perReadAlleleLikelihoodMap) { + final Map perReadAlleleLikelihoodMap) { return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, inheritAttributesFromInputVC, perReadAlleleLikelihoodMap, false); } @@ -370,7 +370,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, - final Map perReadAlleleLikelihoodMap, + final Map perReadAlleleLikelihoodMap, final boolean useAlleleSpecificCalcs) { return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap, useAlleleSpecificCalcs); } @@ -386,7 +386,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final boolean inheritAttributesFromInputVC, - final Map perReadAlleleLikelihoodMap, + final Map perReadAlleleLikelihoodMap, final boolean useAlleleSpecificCalcs) { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; final VariantCallContext result = super.calculateGenotypes(tracker,refContext,rawContext,stratifiedContexts,vc,model,inheritAttributesFromInputVC,perReadAlleleLikelihoodMap, useAlleleSpecificCalcs); @@ -410,7 +410,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc, final AlignmentContext 
rawContext, final Map stratifiedContexts, final RefMetaDataTracker tracker, final ReferenceContext refContext, final List alleleCountsofMLE, final boolean bestGuessIsRef, final AFCalculationResult AFresult, final List allAllelesToUse, final GenotypesContext genotypes, - final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap, + final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap, final boolean useAlleleSpecificCalcs) { final Map result = super.composeCallAttributes(inheritAttributesFromInputVC, vc,rawContext,stratifiedContexts,tracker,refContext,alleleCountsofMLE,bestGuessIsRef, AFresult,allAllelesToUse,genotypes,model,perReadAlleleLikelihoodMap, useAlleleSpecificCalcs); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 5c2985720..491abf6fc 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -211,7 +211,6 @@ import java.util.*; * -I sample1.bam [-I sample2.bam ...] 
\ * [--dbsnp dbSNP.vcf] \ * [-stand_call_conf 30] \ - * [-stand_emit_conf 10] \ * [-L targets.interval_list] \ * -o output.raw.snps.indels.vcf * @@ -224,7 +223,6 @@ import java.util.*; * -I sample1.bam \ * [--dbsnp dbSNP.vcf] \ * -stand_call_conf 20 \ - * -stand_emit_conf 20 \ * -o output.raw.snps.indels.vcf * * @@ -590,7 +588,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if (HCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES) throw new UserException.BadArgumentValue("ERC/gt_mode","you cannot request reference confidence output and GENOTYPE_GIVEN_ALLELES at the same time"); - HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0; HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = -0.0; // also, we don't need to output several of the annotations @@ -632,7 +629,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In simpleUAC.outputMode = OutputMode.EMIT_VARIANTS_ONLY; simpleUAC.genotypingOutputMode = GenotypingOutputMode.DISCOVERY; simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.CONTAMINATION_FRACTION_FILE = null; simpleUAC.exactCallsLog = null; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 3cdfec3ba..9fc875267 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -310,7 +310,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + "-A SnpEff", 1, - Arrays.asList("65641c92469ab80513b04144d0eae900")); + Arrays.asList("2a1eced23dd605d1b0a3efde3f04e23f")); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 8278148ec..b6cc66d1c 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -70,7 +70,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("e7f216d2f9857a579ef3e211076b37a4")); + Arrays.asList("f03e4ef62d6614c9b1b0a600f7e9f16d")); executeTest("test MultiSample Pilot1", spec); } @@ -102,7 +102,7 @@ public class 
UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("25b710f830749448cd056c9b2e7798ff")); + Arrays.asList("fca6cacfb523114a3fb93772569deb08")); executeTest("test Multiple SNP alleles", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 31bd7d0a8..d170e3667 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -107,7 +107,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeBAMOutFlags() throws IOException { - HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "56086abc3bd5e3f7d111f452b7cc4fa1", "6a81bbefa6c4ed7a6b8d2c3e0e5a4756"); + HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "729ebefdce0d5ea6f535c354c329e6b9", "d38aab5bf8ef0bc7c18e8c909819da84"); } @Test @@ -264,7 +264,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDoesNotFailOnBadRefBase() { // don't care about the output - just want to make sure it doesn't fail - final String base = String.format("-T HaplotypeCaller --disableDithering -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, 
REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; + final String base = String.format("-T HaplotypeCaller --disableDithering -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2"; final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); } @@ -421,7 +421,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final String DBSNP = b37dbSNP138; final String commandLineWithoutInterval = String.format("-T HaplotypeCaller -pairHMMSub %s %s -I %s -R %s -D %s " + "-variant_index_type LINEAR -variant_index_parameter 128000 --no_cmdline_in_header " - + "-stand_call_conf 10.0 -stand_emit_conf 10.0", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, TEST_BAM, REFERENCE, DBSNP); + + "-stand_call_conf 10.0", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, TEST_BAM, REFERENCE, DBSNP); final String commandLineShortInterval = commandLineWithoutInterval + " -L " + SHORT_INTERVAL; final String commandLineLongInterval = commandLineWithoutInterval + " -L " + LONG_INTERVAL; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index ff42afac0..f511990c7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -108,7 +108,7 @@ public class 
GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000", b37KGReference), 1, - Collections.singletonList("c9edd4ca8c2801c4681322087d82e781")); + Collections.singletonList("61dd2aaabf94a8f5b87d5069a75d84d7")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -170,7 +170,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Collections.singletonList("f48114bc6348cdc9dc4f0960f5dcf5f8")); + Collections.singletonList("3943e70eed48618040469e157509868e")); executeTest("combineSingleSamplePipelineGVCFHierarchical", spec); } @@ -182,7 +182,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Collections.singletonList("f88841deb5c2ce4f3bbea1e914a13898")); + Collections.singletonList("51d498327342bd3b0b092845b437aad5")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } @@ -257,9 +257,9 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { @Test public void testStandardConf() { final WalkerTestSpec spec = new WalkerTestSpec( - baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"), + baseBPResolutionString("-stand_call_conf 300"), 1, - Collections.singletonList("0283e784ed49bc2dce32a26137c43409")); + Collections.singletonList("30903101c5459f602d7004934bc85ca9")); executeTest("testStandardConf", spec); } @@ -299,7 +299,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + " --uniquifySamples", b37KGReference), 1, - Collections.singletonList("16d7374502fa3cf99863d15d31b5ef86")); + 
Collections.singletonList("c23b1e3f9a960e022038768998a8df82")); executeTest("testUniquifiedSamples", spec); } @@ -666,7 +666,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testGenotypingSpanningDeletionAcrossLines() { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "input-1_2256566.vcf", b37KGReference), - Collections.singletonList("1f914189326cdd17d0a8753f13cb221f")); + Collections.singletonList("24ac243e77e679508c6554194923317b")); spec.disableShadowBCF(); executeTest("testGenotypingSpanningDeletionAcrossLines", spec); } From 601c26a5922c47a51ee5e0b9659f9856111f1ab0 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 25 Aug 2016 11:54:02 -0400 Subject: [PATCH 36/68] More small refactorings of Mutect2 code --- .../gatk/tools/walkers/cancer/m2/MuTect2.java | 302 ++++++------------ .../cancer/m2/SomaticGenotypingEngine.java | 52 +-- .../cancer/m2/MuTect2IntegrationTest.java | 7 - 3 files changed, 116 insertions(+), 245 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java index 4f509f796..4722e51f3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2.java @@ -71,6 +71,7 @@ import org.broadinstitute.gatk.tools.walkers.haplotypecaller.*; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import 
org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; @@ -189,20 +190,18 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i protected SampleList samplesList; protected boolean printTCGAsampleHeader = false; - // fasta reference reader to supplement the edges of the reference sequence protected CachingIndexedFastaSequenceFile referenceReader; - - // the assembly engine protected LocalAssemblyEngine assemblyEngine = null; - - // the likelihoods engine protected ReadLikelihoodCalculationEngine likelihoodCalculationEngine = null; - - // the genotyping engine - protected HaplotypeCallerGenotypingEngine genotypingEngine = null; + protected SomaticGenotypingEngine genotypingEngine = null; + private HaplotypeBAMWriter haplotypeBAMWriter; private byte MIN_TAIL_QUALITY; private double log10GlobalReadMismappingRate; + private static final int REFERENCE_PADDING = 500; + private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument @ArgumentCollection protected M2ArgumentCollection MTAC = new M2ArgumentCollection(); @@ -221,10 +220,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i @Argument(fullName = "MQ_filtering_level", shortName = "MQthreshold", required = false, doc="Set an alternate MQ threshold for debugging") final public int MQthreshold = 20; - - public RodBinding getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; } - - private HaplotypeBAMWriter haplotypeBAMWriter; /** * If a call overlaps with a record from the provided comp track, the INFO field will be annotated * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). 
@@ -291,12 +286,14 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) protected boolean justDetermineActiveRegions = false; - // reference base padding size - private static final int REFERENCE_PADDING = 500; + /** + * As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior. + */ + @Advanced + @Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false) + protected boolean doNotRunPhysicalPhasing = false; - private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; - private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument - private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument + public RodBinding getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; } @Override public void initialize() { @@ -388,7 +385,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC, tumorSampleName, normalSampleName, DEBUG_READ_NAME); - genotypingEngine.setCrossHaplotypeEventMerger(variantMerger); genotypingEngine.setAnnotationEngine(annotationEngine); @@ -438,7 +434,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i headerInfo.addAll(getM2HeaderLines()); headerInfo.addAll(getSampleHeaderLines()); - final List outputSampleNames = getOutputSampleNames(); + // if printTCGAsampleHeader, we already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize() + 
final List outputSampleNames = printTCGAsampleHeader ? Arrays.asList("TUMOR", "NORMAL") : SampleListUtils.asList(samplesList); vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames)); @@ -507,19 +504,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return sampleLines; } - private List getOutputSampleNames(){ - if (printTCGAsampleHeader) { - //Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize() - final List sampleNamePlaceholders = new ArrayList<>(2); - sampleNamePlaceholders.add("TUMOR"); - sampleNamePlaceholders.add("NORMAL"); - return sampleNamePlaceholders; - } - else { - return SampleListUtils.asList(samplesList); - } - } - @Override public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { if( context == null || context.getBasePileup().isEmpty() ) @@ -564,7 +548,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i prob = 1; logger.debug("At " + ref.getLocus().toString() + " tlod: " + tumorLod + " and no-normal calling"); } - } return new ActivityProfileState( ref.getLocus(), prob, ActivityProfileState.Type.NONE, null); @@ -573,16 +556,13 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i private final static List NO_CALLS = Collections.emptyList(); @Override public List map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { - if ( justDetermineActiveRegions ) - // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work + if ( justDetermineActiveRegions ) { return NO_CALLS; - - if( !originalActiveRegion.isActive() ) - // Not active so nothing to do! + } else if( !originalActiveRegion.isActive() ) { return referenceModelForNoVariation(originalActiveRegion, true); - - // No reads here so nothing to do! 
- if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } + } else if( originalActiveRegion.size() == 0 ) { + return referenceModelForNoVariation(originalActiveRegion, true); + } logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region"); // create the assembly using just high quality reads (Q20 or higher). We want to use lower @@ -593,30 +573,25 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i assemblyActiveRegion.add(rec); } } - logReadInfo(DEBUG_READ_NAME, assemblyActiveRegion.getReads(), "Present in assembly active region"); // run the local assembler, getting back a collection of information on how we should proceed - final List givenAlleles = new ArrayList<>(); - final AssemblyResultSet untrimmedAssemblyResult = assembleReads(assemblyActiveRegion, givenAlleles); - - + final AssemblyResultSet untrimmedAssemblyResult = assembleReads(assemblyActiveRegion, Collections.EMPTY_LIST); final TreeSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(); - // TODO - line bellow might be unecessary : it might be that assemblyResult will always have those alleles anyway - // TODO - so check and remove if that is the case: - allVariationEvents.addAll(givenAlleles); - final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents); - - - // Stop the trimming madness!!! - if (!trimmingResult.isVariationPresent()) - return referenceModelForNoVariation(originalActiveRegion,false); + if (!trimmingResult.isVariationPresent()) { + return referenceModelForNoVariation(originalActiveRegion, false); + } logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result"); final AssemblyResultSet assemblyResult = trimmingResult.needsTrimming() ? 
untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult; + // it is conceivable that the region is active yet has no events upon assembling only the well-mapped reads + if( ! assemblyResult.isVariationPresent() ) { + return referenceModelForNoVariation(originalActiveRegion, false); + } + final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping"); @@ -624,7 +599,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads()); } - // filter out reads from genotyping which fail mapping quality based criteria //TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method //TODO - on the originalActiveRegion? @@ -639,17 +613,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final Map> perSampleFilteredReadList = splitReadsBySample(filteredReads); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads"); - // abort early if something is out of the acceptable range - // TODO is this ever true at this point??? perhaps GGA. Need to check. - if( ! assemblyResult.isVariationPresent() ) - return referenceModelForNoVariation(originalActiveRegion, false); - - // TODO is this ever true at this point??? perhaps GGA. Need to check. - if( regionForGenotyping.size() == 0 ) { - // no reads remain after filtering so nothing else to do! 
- return referenceModelForNoVariation(originalActiveRegion, false); - } - // evaluate each sample's reads against all haplotypes final List haplotypes = assemblyResult.getHaplotypeList(); @@ -658,6 +621,11 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample"); } + //TODO: this should be written as + //TODO final Map ARreads_origNormalMQ = regionForGenotyping.getReads().stream() + //TODO .collect(Collectors.toMap(GATKSAMRecord::getReadName, GATKSAMRecord::getMappingQuality)); + //TODO: but for some reason sometimes streamifying Mutect2 code causes a MalformedWalkerArgumentsException + //TODO: this will probably not occur after the port to GATK4 final HashMap ARreads_origNormalMQ = new HashMap<>(); for (final GATKSAMRecord read : regionForGenotyping.getReads()) { ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality()); @@ -671,14 +639,8 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } logger.debug("Computing read likelihoods with " + regionForGenotyping.getReads().size() + " reads against " + haplotypes.size() + " haplotypes across region " + assemblyResult.getRegionForGenotyping().toString()); + final ReadLikelihoods readLikelihoods = likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads); - - // Calculate the likelihoods: CPU intensive part. - final ReadLikelihoods readLikelihoods = - likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads); - - // Realign reads to their best haplotype. - // KCIBUL: this is new stuff -- review it! 
final Map readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getReferenceHaplotype(), assemblyResult.getPaddedReferenceLoc()); if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) { @@ -693,22 +655,14 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods"); } - // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there - // was a bad interaction between that selection and the marginalization that happens over each event when computing - // GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the - // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included - // in the genotyping, but we lose information if we select down to a few haplotypes. [EB] - - final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = ((SomaticGenotypingEngine)genotypingEngine).callMutations( - haplotypes, + final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.callMutations( readLikelihoods, ARreads_origNormalMQ, perSampleFilteredReadList, assemblyResult.getFullReferenceWithPadding(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping.getLocation(), - metaDataTracker, - givenAlleles); + metaDataTracker); if ( MTAC.bamWriter != null ) { final Set calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes()); @@ -727,66 +681,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i } if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } - - - final List annotatedCalls = new ArrayList<>(); - final int eventCount = calledHaplotypes.getCalls().size(); - Integer minEventDistance = null; - Integer maxEventDistance = null; - Integer lastPosition = null; - for (final 
VariantContext vc : calledHaplotypes.getCalls()) { - if (lastPosition == null) { - lastPosition = vc.getStart(); - } else { - final int dist = Math.abs(vc.getStart() - lastPosition); - if (maxEventDistance == null || dist > maxEventDistance) { - maxEventDistance = dist; - } - if (minEventDistance == null || dist < minEventDistance) { - minEventDistance = dist; - } - } - } - final Map eventDistanceAttributes = new HashMap<>(); - eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount); - eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance); - eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance); - - - // can we do this with the Annotation classes instead? - for (final VariantContext originalVC : calledHaplotypes.getCalls()) { - final VariantContextBuilder vcb = new VariantContextBuilder(originalVC); - - final Map attributes = new HashMap<>(originalVC.getAttributes()); - attributes.putAll(eventDistanceAttributes); - vcb.attributes(attributes); - - final Set filters = new HashSet<>(originalVC.getFilters()); - - final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1); - if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) { - filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME); - } - - // if we are in artifact detection mode, apply the thresholds for the LOD scores - if (!MTAC.ARTIFACT_DETECTION_MODE) { - filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes)); - } - - vcb.filters(filters.isEmpty() ? 
VariantContext.PASSES_FILTERS : filters); - - if (printTCGAsampleHeader) { - final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make(); - final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make(); - vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype)); - } - - annotatedCalls.add(vcb.make()); - } - - // TODO: find a better place for this debug message - // logger.info("We had " + TumorPowerCalculator.numCacheHits + " hits in starnd artifact power calculation"); - return annotatedCalls; + return annotateVCs(calledHaplotypes, metaDataTracker); } private Set calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map eventDistanceAttributes) { @@ -828,15 +723,10 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i if ( (normalAltCounts > MTAC.MAX_ALT_ALLELES_IN_NORMAL_COUNT || normalF > MTAC.MAX_ALT_ALLELE_IN_NORMAL_FRACTION ) && normalAltQualityScoreSum > MTAC.MAX_ALT_ALLELES_IN_NORMAL_QSCORE_SUM) { filters.add(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME); - } else { - - // NOTE: does normal alt counts presume the normal had all these events in CIS? 
- if ( eventCount > 1 && normalAltCounts >= 1) { - filters.add(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME); - } else if (eventCount >= 3) { - filters.add(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME); - } - + } else if ( eventCount > 1 && normalAltCounts >= 1) { + filters.add(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME); + } else if (eventCount >= 3) { + filters.add(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME); } // STR contractions, that is the deletion of one repeat unit of a short repeat (>1bp repeat unit) @@ -861,7 +751,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i filters.add(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME); } - // clustered read position filter if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){ final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY); @@ -878,7 +767,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return filters; } - private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; /** * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt @@ -916,6 +804,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i return (normalSampleName != null); } + //TODO: streamify in GATK4 protected int getCountOfNonRefEvents(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) { int i=0; for( final PileupElement p : pileup ) { @@ -960,37 +849,21 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i protected Set filterNonPassingReads( final ActiveRegion activeRegion) { final Set readsToRemove = new LinkedHashSet<>(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { - + //TODO: Takuto points out that this is questionable. Let's think hard abut it. // KCIBUL: only perform read quality filtering on tumor reads... 
if (isReadFromNormal(rec)) { - if( rec.getReadLength() < MIN_READ_LENGTH ) { readsToRemove.add(rec); } - - } else { - - - if( rec.getReadLength() < MIN_READ_LENGTH || - rec.getMappingQuality() < MQthreshold || - BadMateFilter.hasBadMate(rec) || - + } else if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < MQthreshold || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { - readsToRemove.add(rec); - } + readsToRemove.add(rec); } } activeRegion.removeAll(readsToRemove); return readsToRemove; } - private static GATKSAMRecord findReadByName(final Collection reads, final String name) { - for(final GATKSAMRecord read : reads) { - if (name.equals(read.getReadName())) return read; - } - return null; - } - /** * Instantiates the appropriate likelihood calculation engine. * @@ -1025,13 +898,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i // enable non primary and extended reads in the active region @Override public EnumSet desiredReadStates() { -// if ( includeUnmappedReads ) -// throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional"); -// else - return EnumSet.of( - ActiveRegionReadState.PRIMARY, - ActiveRegionReadState.NONPRIMARY, - ActiveRegionReadState.EXTENDED); + return EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED); } //--------------------------------------------------------------------------------------------------------------- @@ -1063,7 +930,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i logger.info("Ran local assembly on " + result + " active regions"); } - // The following are not used but are required by the AnnotatorCompatible interface public RodBinding getSnpEffRodBinding() { return null; } public List> getResourceRodBindings() { return Collections.emptyList(); } @@ -1087,15 +953,12 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final Haplotype 
referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc); // Create ReadErrorCorrector object if requested - will be used within assembly engine. - ReadErrorCorrector readErrorCorrector = null; - if (errorCorrectReads) - readErrorCorrector = new ReadErrorCorrector(RTAC.kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, RTAC.minObservationsForKmerToBeSolid, MTAC.DEBUG, fullReferenceWithPadding); - + final ReadErrorCorrector readErrorCorrector = errorCorrectReads ? new ReadErrorCorrector(RTAC.kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, + RTAC.minObservationsForKmerToBeSolid, MTAC.DEBUG, fullReferenceWithPadding) : null; try { final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, giveAlleles,readErrorCorrector ); assemblyResultSet.debugDump(logger); return assemblyResultSet; - } catch ( final Exception e ) { // Capture any exception that might be thrown, and write out the assembly failure BAM if requested if ( captureAssemblyFailureBAM ) { @@ -1151,10 +1014,6 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.DOWNSAMPLED, activeRegion.getReads(), downsampledReads); } - // handle overlapping read pairs from the same fragment - // KC: commented out as we handle overlapping read pairs in a different way... 
- //cleanOverlappingReadPairs(downsampledReads, normalSampleNames); - activeRegion.clearReads(); activeRegion.addAll(downsampledReads); activeRegion.setFinalized(true); @@ -1189,11 +1048,7 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i final FragmentCollection fragmentCollection = FragmentUtils.create(perSampleReadList); for ( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) - - // in MuTect -- right now we compare the FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); - - } } @@ -1236,20 +1091,61 @@ public class MuTect2 extends ActiveRegionWalker, Integer> i private boolean isReadFromNormal(final GATKSAMRecord rec) { return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample()); - } public String getTumorSampleName(){ return tumorSampleName; } - // KCIBUL: new stuff -- read up on this!! - /** - * As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior. 
- */ - @Advanced - @Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false) - protected boolean doNotRunPhysicalPhasing = false; + final List annotateVCs(final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes, final RefMetaDataTracker metaDataTracker) { + final int eventCount = calledHaplotypes.getCalls().size(); + final Map eventDistanceAttributes = new HashMap<>(); //TODO: should be Map -- see TODO below + eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount); + if (eventCount > 1) { + final int lastPosition = calledHaplotypes.getCalls().get(0).getStart(); + final int[] eventDistances = new int[calledHaplotypes.getCalls().size() - 1]; + for (int n = 0; n < eventDistances.length; n++) { + eventDistances[n] = Math.abs(calledHaplotypes.getCalls().get(n + 1).getStart() - lastPosition); + } + eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, MathUtils.arrayMin(eventDistances)); + eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, MathUtils.arrayMax(eventDistances)); + } else { //TODO: putting null is a hack -- we should remove this and update the integration test md5s + eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, null); + eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, null); + } + + final List annotatedCalls = new ArrayList<>(); + // can we do this with the Annotation classes instead? 
+ for (final VariantContext originalVC : calledHaplotypes.getCalls()) { + final VariantContextBuilder vcb = new VariantContextBuilder(originalVC); + + final Map attributes = new HashMap<>(originalVC.getAttributes()); + attributes.putAll(eventDistanceAttributes); + vcb.attributes(attributes); + + final Set filters = new HashSet<>(originalVC.getFilters()); + + final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1); + if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) { + filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME); + } + + // if we are in artifact detection mode, apply the thresholds for the LOD scores + if (!MTAC.ARTIFACT_DETECTION_MODE) { + filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes)); + } + + vcb.filters(filters.isEmpty() ? VariantContext.PASSES_FILTERS : filters); + + if (printTCGAsampleHeader) { + final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make(); + final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make(); + vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype)); + } + annotatedCalls.add(vcb.make()); + } + return annotatedCalls; + } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java index 20940c74f..a0b259e1e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/SomaticGenotypingEngine.java @@ -51,10 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2; -import com.google.java.contract.Ensures; -import htsjdk.samtools.util.StringUtil; import 
htsjdk.variant.variantcontext.*; -import org.apache.commons.collections.ListUtils; import org.apache.commons.lang.mutable.MutableDouble; import org.apache.commons.lang.mutable.MutableInt; import org.apache.log4j.Logger; @@ -63,7 +60,6 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvid import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; @@ -87,6 +83,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { private final String matchedNormalSampleName; private final String DEBUG_READ_NAME; + //Mutect2 does not run in GGA mode + private static final List NO_GIVEN_ALLELES = Collections.EMPTY_LIST; + // {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. 
Thus we make a dummy private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() { public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; } @@ -109,7 +108,8 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { this.DEBUG_READ_NAME = DEBUG_READ_NAME; // coverage related initialization - final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE / 10); + //TODO: in GATK4, use a QualityUtils method + final double errorProbability = Math.pow(10, -MTAC.POWER_CONSTANT_QSCORE/10); strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); } @@ -118,48 +118,40 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling * * The list of samples we're working with is obtained from the readLikelihoods - - * @param haplotypes Haplotypes to assign likelihoods to * @param readLikelihoods Map from reads->(haplotypes,likelihoods) * @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods. * @param ref Reference bytes at active region * @param refLoc Corresponding active region genome location * @param activeRegionWindow Active window - * @param activeAllelesToGenotype Alleles to genotype * * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes * */ - // TODO - can this be refactored? this is hard to follow! 
public CalledHaplotypes callMutations ( - final List haplotypes, final ReadLikelihoods readLikelihoods, final Map originalNormalReadQualities, final Map> perSampleFilteredReadList, final byte[] ref, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow, - final RefMetaDataTracker tracker, - final List activeAllelesToGenotype) { - // sanity check input arguments - if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); + final RefMetaDataTracker tracker) { + //TODO: in GATK4 use Utils.nonNull if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods); if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); - if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); + final List haplotypes = readLikelihoods.alleles(); + + // Somatic Tumor/Normal Sample Handling if (!readLikelihoods.samples().contains(tumorSampleName)) { - throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample "+ tumorSampleName); + throw new IllegalArgumentException("readLikelihoods does not contain the tumor sample " + tumorSampleName); } - - // if we don't have the normal sample, we are in tumor only mode - // TODO: check in MuTect2.java for code we can skip when in tumor only mode final boolean hasNormal = matchedNormalSampleName != null; // update the haplotypes so we're ready to call, 
getting the ordered list of positions on the reference // that carry events among the haplotypes - final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, activeAllelesToGenotype); + final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, NO_GIVEN_ALLELES); // Walk along each position in the key set and create each event to be outputted final Set calledHaplotypes = new HashSet<>(); @@ -170,7 +162,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { continue; } - final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, NO_GIVEN_ALLELES); if( eventsAtThisLoc.isEmpty() ) { continue; } @@ -203,14 +195,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { * If we just want a map of Alleles to Haplotypes, we should be able to do so directly; no need for intermediate maps, which just complicates the code. **/ - final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); // converting ReadLikelihoods to ReadLikeliHoods ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); - // LDG: do we want to do this before or after pulling out overlapping reads? - // TODO: do we want this at all? How does downsampling help? + //LDG: do we want to do this before or after pulling out overlapping reads? if (MTAC.isSampleContaminationPresent()) { readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination()); } @@ -232,24 +222,18 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef()); } - // TODO: another good breakpoint e.g. 
compute normal LOD/set thresholds // TODO: anything related to normal should be encapsulated in Optional - // Normal LOD must exceed this threshold for the variant to make it in the vcf - // TODO: variable name too log - double normalLodThresholdForVCF = -Double.MIN_VALUE; - // A variant candidate whose normal LOD is below this threshold will be filtered as 'germline_risk' // This is a more stringent threshold than normalLodThresholdForVCF - double normalLodFilterThreshold = -Double.MIN_VALUE; - + double normalLodFilterThreshold = -Double.MAX_VALUE; PerReadAlleleLikelihoodMap normalPRALM = null; final PerAlleleCollection normalLods = PerAlleleCollection.createPerAltAlleleCollection(); + // if normal bam is available, compute normal LOD // TODO: this if statement should be a standalone method for computing normal LOD // TODO: then we can do something like normalLodThreshold = hasNormal ? thisMethod() : Optional.empty() - // if normal bam is available, compute normal LOD if (hasNormal) { normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName)); filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true); @@ -258,10 +242,8 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc); final Collection cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc); final Collection dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc); - final boolean germlineAtRisk = !dbsnpVC.isEmpty() && cosmicVC.isEmpty(); - normalLodThresholdForVCF = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; normalLodFilterThreshold = germlineAtRisk ? 
MTAC.NORMAL_DBSNP_LOD_THRESHOLD : MTAC.NORMAL_LOD_THRESHOLD; // compute normal LOD = LL(X|REF)/LL(X|ALT) where REF is the diploid HET with AF = 0.5 @@ -284,7 +266,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { for (final Allele altAllele : mergedVC.getAlternateAlleles()) { final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD; - final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= normalLodThresholdForVCF : true; + final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= MTAC.INITIAL_NORMAL_LOD_THRESHOLD : true; if (passesTumorLodThreshold && passesNormalLodThreshold) { numPassingAlts++; allelesThatPassThreshold.add(altAllele); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java index b7ca42c4f..1c99946db 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/cancer/m2/MuTect2IntegrationTest.java @@ -70,8 +70,6 @@ public class MuTect2IntegrationTest extends WalkerTest { final static String DREAM3_TP_INTERVALS_FILE = privateTestDir + "m2_dream3.tp.intervals"; final static String DREAM3_FP_INTERVALS_FILE = privateTestDir + "m2_dream3.fp.intervals"; - - final String commandLine = "-T MuTect2 --no_cmdline_in_header -dt NONE --disableDithering -alwaysloadVectorHMM -pairHMM LOGLESS_CACHING -ip 50 -R %s --dbsnp %s --cosmic %s --normal_panel %s -I:tumor %s -I:normal %s -L %s"; @@ -160,9 +158,6 @@ public class MuTect2IntegrationTest extends WalkerTest { M2Test(CCLE_MICRO_TUMOR_BAM, CCLE_MICRO_NORMAL_BAM, CCLE_MICRO_INTERVALS_FILE, "-contamination 0.1", 
"c25e48edd704bbb436cd6456d9f47d8b"); } - /** - * Test that tumor-only mode does not create an empty vcf - */ @Test public void testTumorOnly(){ m2TumorOnlyTest(CCLE_MICRO_TUMOR_BAM, "2:166000000-167000000", "", "2af2253b1f09ea8fd354e1bf2c4612f0"); @@ -177,6 +172,4 @@ public class MuTect2IntegrationTest extends WalkerTest { public void testClusteredReadPositionFilter() { M2Test(DREAM3_TUMOR_BAM, DREAM3_NORMAL_BAM, DREAM3_FP_INTERVALS_FILE, "--enable_clustered_read_position_filter", "b44c23af7de84f96d2371db25d29aba2"); } - - } From 3c88e6859f0d0987df3f37e2765cacda06aaa8cf Mon Sep 17 00:00:00 2001 From: Steve Huang Date: Tue, 6 Sep 2016 17:30:18 -0400 Subject: [PATCH 37/68] fix 1430 for genotype filters; refactored filter() method; added unit and integration test; more comprehensive fix must be done first on htsjdk side in JEXLMap (#1456) --- .../VariantFiltrationIntegrationTest.java | 18 + .../walkers/filters/VariantFiltration.java | 542 ++++++++++-------- .../filters/VariantFiltrationUnitTest.java | 141 ++++- 3 files changed, 461 insertions(+), 240 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java index 61d92f0bb..9f2678d20 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java @@ -211,6 +211,24 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { executeTest("testFilteringDPfromFORMAT", spec); } + @Test + public void testFilteringDPfromFORMATWithMissing() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " 
--genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1, + Arrays.asList("4bf46103a71bac92a11eae04b97f9877")); + executeTest("testFilteringDPfromFORMATWithMissing", spec); + } + + @Test + public void testFilteringDPfromFORMATAndFailMissing() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --missingValuesInExpressionsShouldEvaluateAsFailing --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1, + Arrays.asList("baeda696c92adc8745ac4ebbdead6c91")); + executeTest("testFilteringDPfromFORMATAndFailMissing", spec); + } + @Test public void testInvertGenotypeFilterExpression() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java index fb975c37b..c3087fe42 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.tools.walkers.filters; +import com.google.common.annotations.VisibleForTesting; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.commandline.*; @@ -55,7 +56,7 @@ import java.util.*; * *

    * This tool is designed for hard-filtering variant calls based on certain criteria. Records are hard-filtered - * by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved + * by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved * in the output unless their removal is requested in the command line.

    * *

    The most common way of specifying filtering criteria is by using JEXL queries. See the @@ -84,7 +85,7 @@ import java.util.*; * * *

    Caveat

    - *

    when you run VariantFiltration with a command that includes multiple logical parts, each part of the command is applied + *

    when you run {@link VariantFiltration} with a command that includes multiple logical parts, each part of the command is applied * individually to the original form of the VCF record. Say you ran a VF command that includes three parts: one applies * some genotype filters, another applies setFilterGtToNoCall (which changes sample genotypes to ./. whenever a sample has a * genotype-level FT annotation), and yet another one filters sites based on whether any samples have a no-call there. You might @@ -98,14 +99,17 @@ import java.util.*; @Reference(window=@Window(start=-50,stop=50)) public class VariantFiltration extends RodWalker { + // ----------------------------------------------------------------------------------------------- + // Arguments + // ----------------------------------------------------------------------------------------------- @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** * Any variant which overlaps entries from the provided mask rod will be filtered. If the user wants logic to be reversed, - * i.e. filter variants that do not overlap with provided mask, then argument -filterNotInMask can be used. + * i.e. filter variants that do not overlap with provided mask, then argument {@code -filterNotInMask} can be used. * Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used - * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). + * (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking). 
*/ @Input(fullName="mask", shortName="mask", doc="Input ROD mask", required=false) public RodBinding mask; @@ -115,41 +119,41 @@ public class VariantFiltration extends RodWalker { /** * VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using - * --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"). + * {@code --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"}). */ @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false) - protected ArrayList filterExpressions = new ArrayList(); + protected ArrayList filterExpressions = new ArrayList<>(); /** - * This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names. + * This name is put in the

    FILTER
    field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names. */ @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false) - protected ArrayList filterNames = new ArrayList(); + protected ArrayList filterNames = new ArrayList<>(); /** - * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. - * VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag). - * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience - * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1"). Also available are - * expressions isCalled, isNoCall, isMixed, and isAvailable, in accordance with the methods of the Genotype object. + * Similar to the
    INFO
    field based expressions, but used on the
    FORMAT
    (genotype) fields instead. + * {@link VariantFiltration} will add the sample-level
    FT
    tag to the
    FORMAT
    field of filtered samples (this does not affect the record's
    FILTER
    tag). + * One can filter normally based on most fields (e.g. {@code "GQ < 5.0"}), but the
    GT
    (genotype) field is an exception. + * We have put in convenience methods so that one can now filter out hets ({@code "isHet == 1"}), refs ({@code "isHomRef == 1"}), or homs ({@code "isHomVar == 1"}). + * Also available are expressions {@code isCalled}, {@code isNoCall}, {@code isMixed}, and {@code isAvailable}, in accordance with the methods of the {@link Genotype} object. */ @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false) - protected ArrayList genotypeFilterExpressions = new ArrayList(); + protected ArrayList genotypeFilterExpressions = new ArrayList<>(); /** - * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. + * Similar to the
    INFO
    field based expressions, but used on the
    FORMAT
(genotype) fields instead. */ @Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false) - protected ArrayList genotypeFilterNames = new ArrayList(); + protected ArrayList genotypeFilterNames = new ArrayList<>(); /** - * Works together with the --clusterWindowSize argument. + * Works together with the {@code --clusterWindowSize} argument. */ @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false) protected Integer clusterSize = 3; /** - * Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1. + * Works together with the {@code --clusterSize} argument. To disable the clustered SNP filter, set this value to less than 1. */ @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false) protected Integer clusterWindow = 0; @@ -158,45 +162,45 @@ public class VariantFiltration extends RodWalker { protected Integer maskExtension = 0; /** - * When using the -mask argument, the maskName will be annotated in the variant record. - * Note that when using the -filterNotInMask argument to reverse the masking logic, + * When using the {@code -mask} argument, the {@code maskName} will be annotated in the variant record. + * Note that when using the {@code -filterNotInMask} argument to reverse the masking logic, * it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used - * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). + * (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking). 
*/ @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false) protected String maskName = "Mask"; /** - * By default, if the -mask argument is used, any variant falling in a mask will be filtered. + * By default, if the {@code -mask} argument is used, any variant falling in a mask will be filtered. * If this argument is used, logic is reversed, and variants falling outside a given mask will be filtered. * Use case is, for example, if we have an interval list or BED file with "good" sites. * Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used - * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). + * (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking). */ @Argument(fullName="filterNotInMask", shortName="filterNotInMask", doc="Filter records NOT in given input mask.", required=false) protected boolean filterRecordsNotInMask = false; /** - * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing. + * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as
    PASS
    ing. * Use this argument to have it evaluate as failing filters instead for these cases. */ @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false) protected Boolean failMissingValues = false; /** - * Invalidate previous filters applied to the VariantContext, applying only the filters here + * Invalidate previous filters applied to the {@link VariantContext}, applying only the filters here. */ @Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false) boolean invalidatePrevious = false; /** - * Invert the selection criteria for --filterExpression + * Invert the selection criteria for {@code --filterExpression}. */ @Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false) protected boolean invertFilterExpression = false; /** - * Invert the selection criteria for --genotypeFilterExpression + * Invert the selection criteria for {@code --genotypeFilterExpression}. 
*/ @Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false) protected boolean invertGenotypeFilterExpression = false; @@ -207,81 +211,42 @@ public class VariantFiltration extends RodWalker { @Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call") private boolean setFilteredGenotypesToNocall = false; + // ----------------------------------------------------------------------------------------------- + // Fields + // ----------------------------------------------------------------------------------------------- + // JEXL expressions for the filters List filterExps; List genotypeFilterExps; public static final String CLUSTERED_SNP_FILTER_NAME = "SnpCluster"; + private ClusteredSnps clusteredSNPs = null; private GenomeLoc previousMaskPosition = null; - - // the structures necessary to initialize and maintain a windowed context - private FiltrationContextWindow variantContextWindow; + private FiltrationContextWindow variantContextWindow; // the structures necessary to initialize and maintain a windowed context private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one - private ArrayList windowInitializer = new ArrayList(); + private ArrayList windowInitializer = new ArrayList<>(); - private final List diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + private static final List DIPLOID_NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - /** - * Prepend inverse phrase to description if --invertFilterExpression - * - * @param description the description - * @return the description with inverse prepended if --invert_filter_expression - */ - private String possiblyInvertFilterExpression( String description ){ - if ( invertFilterExpression ) - description = "Inverse of: " + description; - return description; - } - - private void initializeVcfWriter() { - - 
final List inputNames = Arrays.asList(variantCollection.variants.getName()); - - // setup the header fields - Set hInfo = new HashSet(); - hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames)); - - // need AC, AN and AF since output if set filtered genotypes to no-call - if ( setFilteredGenotypesToNocall ) { - hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY)); - hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY)); - hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY)); - } - - if ( clusterWindow > 0 ) - hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters")); - - if ( !genotypeFilterExps.isEmpty() ) - hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); - - try { - for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) - hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); - for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) - hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); - - if ( mask.isBound() ) { - if (filterRecordsNotInMask) - hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask")); - else hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask")); - } - } catch (IllegalArgumentException e) { - throw new UserException.BadInput(e.getMessage()); - } - - writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); - } + // ----------------------------------------------------------------------------------------------- + // public methods from base classes + // ----------------------------------------------------------------------------------------------- public void initialize() { - if ( clusterWindow > 0 ) - clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(),clusterSize, clusterWindow); - 
if ( maskExtension < 0 ) - throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed"); + if ( maskExtension < 0 ) { + throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed"); + } + + if (filterRecordsNotInMask && !mask.isBound()) { + throw new UserException.BadArgumentValue("filterNotInMask", "argument not allowed if mask argument is not provided"); + } + + if ( clusterWindow > 0 ) { + clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(), clusterSize, clusterWindow); + } - if (filterRecordsNotInMask && !mask.isBound()) - throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided"); filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions); genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions); @@ -290,44 +255,45 @@ public class VariantFiltration extends RodWalker { initializeVcfWriter(); } - public Integer reduceInit() { return 0; } - /** * * @param tracker the meta-data tracker * @param ref the reference base * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise + * @return 1 if the locus was successfully processed, 0 otherwise */ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) + if ( tracker == null ) { return 0; + } - Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); + final Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); // is there a SNP mask present? 
- boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask)); - if ( hasMask ) + final boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask)); + if ( hasMask ) { previousMaskPosition = ref.getLocus(); // multi-base masks will get triggered over all bases of the mask + } for ( VariantContext vc : VCs ) { if ( invalidatePrevious ) { - vc = (new VariantContextBuilder(vc)).filters(new HashSet()).make(); + vc = (new VariantContextBuilder(vc)).filters(new HashSet<>()).make(); } // filter based on previous mask position vc = addMaskIfCoversVariant(vc, previousMaskPosition, maskName, maskExtension, false); - FiltrationContext varContext = new FiltrationContext(ref, vc); + final FiltrationContext varContext = new FiltrationContext(ref, vc); // if we're still initializing the context, do so if ( windowInitializer != null ) { // if this is a mask position, filter previous records if ( hasMask ) { - for ( FiltrationContext prevVC : windowInitializer ) + for ( final FiltrationContext prevVC : windowInitializer ) { prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true)); + } } windowInitializer.add(varContext); @@ -339,9 +305,10 @@ public class VariantFiltration extends RodWalker { // if this is a mask position, filter previous records if ( hasMask ) { - for ( FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) { - if ( prevVC != null ) + for ( final FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) { + if ( prevVC != null ) { prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true)); + } } } @@ -353,6 +320,244 @@ public class VariantFiltration extends RodWalker { return 1; } + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public Integer 
reduceInit() { return 0; } + + /** + * Tell the user the number of loci processed and close out the new variants file. + * + * @param result the number of loci seen. + */ + public void onTraversalDone(Integer result) { + // move the window over so that we can filter the last few variants + if ( windowInitializer != null ) { + while ( windowInitializer.size() < WINDOW_SIZE ) { + windowInitializer.add(null); + } + variantContextWindow = new FiltrationContextWindow(windowInitializer); + } + for (int i=0; i < WINDOW_SIZE; i++) { + variantContextWindow.moveWindow(null); + filter(); + } + } + + // ----------------------------------------------------------------------------------------------- + // main filtering steps + // ----------------------------------------------------------------------------------------------- + + /** + * Organizing filters: genotype filters and normal filters. + */ + private void filter() { + // get the current context + final FiltrationContext context = variantContextWindow.getContext(); + if ( context == null ) { + return; + } + + final VariantContext vc = context.getVariantContext(); + + // make new Genotypes based on genotype filters + final VariantContextBuilder builder = ( genotypeFilterExps.isEmpty() && !setFilteredGenotypesToNocall ) ? new VariantContextBuilder(vc) + : applyGenotypeFilters(vc, genotypeFilterExps, invertGenotypeFilterExpression, failMissingValues, setFilteredGenotypesToNocall); + + // extract filters already in VC and append new filters + final Set filters = buildVCfilters(vc, filterExps, invertFilterExpression, failMissingValues); + // test for clustered SNPs if requested + if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) ) { + filters.add(CLUSTERED_SNP_FILTER_NAME); + } + + // make a new variant context based on all filters, and write + writer.add(filters.isEmpty() ? 
builder.passFilters().make() : builder.filters(filters).make()); + } + + /** + * Given a VC builder and a vc (which was used to construct the builder), update the properties that the builder + * will use to construct a new VC, based on some of the attributes/annotations of the old VC. + * @param vc variant context holding genotypes to be filtered + * @param genotypeFilterExpressions genotype filter expressions + * @param invertGenotypeFilterExpression should invert the genotype filter expression or not + * @param failIfMissingValues if sample misses the corresponding annotation(s) the filter(s) work by, should we fail them or not + * @param setFilteredGenotypesToNocall if sample is filtered should we set genotype to non-call or not + */ + @VisibleForTesting + static VariantContextBuilder applyGenotypeFilters(final VariantContext vc, + final List genotypeFilterExpressions, + final boolean invertGenotypeFilterExpression, + final boolean failIfMissingValues, + final boolean setFilteredGenotypesToNocall) { + + final VariantContextBuilder builder = new VariantContextBuilder(vc); + + final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); + + // recompute AC, AN and AF if filtered genotypes are set to no-call + // occurrences of alternate alleles over all genotypes + final Map calledAltAlleles = new LinkedHashMap<>(vc.getNAlleles()-1); + for ( final Allele altAllele : vc.getAlternateAlleles() ) { + calledAltAlleles.put(altAllele, 0); + } + + int calledAlleles = 0; + boolean haveFilteredNoCallAlleles = false; + + // for each genotype, check filters then create a new object + for ( final Genotype g : vc.getGenotypes() ) { + if ( g.isCalled() ) { + final List filters = new ArrayList<>(); + if ( g.isFiltered() ) filters.add(g.getFilters()); + + // Add if expression filters the variant context + for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExpressions ) { + try { + if (Utils.invertLogic(VariantContextUtils.match(vc, g, exp), 
invertGenotypeFilterExpression)) { + filters.add(exp.name); + } + } catch (final IllegalArgumentException e) { + // logic: right now (2016/08/18) if a filter is applied based on specific annotation and some sample contains missing value for such annotation, + // lower level code will throw IllegalArgumentException, therefore we specifically catch this type of exception + // do nothing unless specifically asked to; it just means that the expression isn't defined for this context + if ( failIfMissingValues ) { + filters.add(exp.name); + } + } + } + + // if sample is filtered and --setFilteredGtToNocall, set genotype to non-call + if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) { + haveFilteredNoCallAlleles = true; + genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(DIPLOID_NO_CALL_ALLELES).make()); + } + else { + genotypes.add(new GenotypeBuilder(g).filters(filters).make()); + calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g); + } + } else { + genotypes.add(g); + } + } + + builder.genotypes(genotypes); + // if filtered genotypes are set to no-call, output recomputed AC, AN, AF + if ( haveFilteredNoCallAlleles ) { + GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder); + } + + return builder; + } + + /** + * Extract filters already present in the {@code vc}, and append user provided expressions. 
+ * For user provided genotype filter expressions, see {@link #applyGenotypeFilters(VariantContext, List, boolean, boolean, boolean)} + * @param vc VC from which filters to be extracted + * @param vcFilterExpressions more filter expressions provided by user + * @param invertVCfilterExpression should we invert the logic in expressions provided in {@code vcFilterExpressions} + * @param failIfMissingValues should we mark the VC as failing if it misses the value the filters work on + * + * @return filters already in the provided vc and user-provided filters + */ + @VisibleForTesting + static Set buildVCfilters(final VariantContext vc, + final List vcFilterExpressions, + final boolean invertVCfilterExpression, + final boolean failIfMissingValues) { + + final Set filters = new LinkedHashSet<>(vc.getFilters()); + + for ( final VariantContextUtils.JexlVCMatchExp exp : vcFilterExpressions ) { + try { + if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertVCfilterExpression) ) { + filters.add(exp.name); + } + } catch (final Exception e) { + // do nothing unless specifically asked to; it just means that the expression isn't defined for this context + if ( failIfMissingValues ) { + filters.add(exp.name); + } + } + } + return filters; + } + + // ----------------------------------------------------------------------------------------------- + // some other complications besides main stuff + // ----------------------------------------------------------------------------------------------- + + private void initializeVcfWriter() { + + final List inputNames = Arrays.asList(variantCollection.variants.getName()); + + // setup the header fields + final Set hInfo = new HashSet(); + hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames)); + + // need AC, AN and AF since output if set filtered genotypes to no-call + if ( setFilteredGenotypesToNocall ) { + hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY)); + 
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY)); + hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY)); + } + + if ( clusterWindow > 0 ) { + hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters")); + } + + if ( !genotypeFilterExps.isEmpty() ) { + hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); + } + + try { + for ( final VariantContextUtils.JexlVCMatchExp exp : filterExps ) { + hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); + } + for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { + hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); + } + + if ( mask.isBound() ) { + hInfo.add(new VCFFilterHeaderLine(maskName, filterRecordsNotInMask ? "Doesn't overlap a user-input mask" : "Overlaps a user-input mask")); + } + } catch (final IllegalArgumentException e) { + throw new UserException.BadInput(e.getMessage()); + } + + writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); + } + + /** + * Prepend inverse phrase to description if {@code --invertFilterExpression}. + * + * @param description the description + * @return the description with inverse prepended if {@code --invert_filter_expression}. + */ + private String possiblyInvertFilterExpression( final String description ){ + return invertFilterExpression ? "Inverse of: " + description : description; + } + + /** + * Add mask to variant context filters if it covers its location. + * @param vc VariantContext + * @param genomeLoc genome location + * @param maskName name of the mask + * @param maskExtension bases beyond the mask + * @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite. 
+ * @return VariantContext with the mask added if the VariantContext is within the extended mask area + */ + private VariantContext addMaskIfCoversVariant(VariantContext vc, final GenomeLoc genomeLoc, final String maskName, final int maskExtension, final boolean locStart) { + if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) { + final Set filters = new LinkedHashSet<>(vc.getFilters()); + filters.add(maskName); + vc = new VariantContextBuilder(vc).filters(filters).make(); + } + + return vc; + } + /** * Helper function to check if a mask covers the variant location. * @@ -364,141 +569,14 @@ public class VariantFiltration extends RodWalker { * @return true if the genome location is within the extended mask area, false otherwise */ protected static boolean doesMaskCoverVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean vcBeforeLoc) { - boolean logic = genomeLoc != null && // have a location - genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig + final boolean needToCheckOveralpping = genomeLoc != null && // have a location + genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig (vc.getFilters() == null || !vc.getFilters().contains(maskName)); // the filter hasn't already been applied - if ( logic ) { - if (vcBeforeLoc) - return genomeLoc.getStart() - vc.getEnd() <= maskExtension; // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance) - else - return vc.getStart() - genomeLoc.getStop() <= maskExtension; + if ( needToCheckOveralpping ) { + return vcBeforeLoc ? 
(genomeLoc.getStart() - vc.getEnd() <= maskExtension) // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance) + : (vc.getStart() - genomeLoc.getStop() <= maskExtension); } else { return false; } } - - /** - * Add mask to variant context filters if it covers the it's location - * - * @param vc VariantContext - * @param genomeLoc genome location - * @param maskName name of the mask - * @param maskExtension bases beyond the mask - * @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite. - * @return VariantContext with the mask added if the VariantContext is within the extended mask area - */ - private VariantContext addMaskIfCoversVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean locStart) { - if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) { - Set filters = new LinkedHashSet(vc.getFilters()); - filters.add(maskName); - vc = new VariantContextBuilder(vc).filters(filters).make(); - } - - return vc; - } - - private void filter() { - // get the current context - FiltrationContext context = variantContextWindow.getContext(); - if ( context == null ) - return; - - final VariantContext vc = context.getVariantContext(); - final VariantContextBuilder builder = new VariantContextBuilder(vc); - - // make new Genotypes based on filters - if ( !genotypeFilterExps.isEmpty() || setFilteredGenotypesToNocall ) { - GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); - - // - // recompute AC, AN and AF if filtered genotypes are set to no-call - // - // occurrences of alternate alleles over all genotypes - final Map calledAltAlleles = new LinkedHashMap(vc.getNAlleles()-1); - for ( final Allele altAllele : vc.getAlternateAlleles() ) { - calledAltAlleles.put(altAllele, 0); - } - - int calledAlleles = 0; - boolean haveFilteredNoCallAlleles = false; - - // for each genotype, check filters then 
create a new object - for ( final Genotype g : vc.getGenotypes() ) { - if ( g.isCalled() ) { - final List filters = new ArrayList(); - if ( g.isFiltered() ) filters.add(g.getFilters()); - - // Add if expression filters the variant context - for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { - if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) ) - filters.add(exp.name); - } - - // if sample is filtered and --setFilteredGtToNocall, set genotype to non-call - if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) { - haveFilteredNoCallAlleles = true; - genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(diploidNoCallAlleles).make()); - } - else { - genotypes.add(new GenotypeBuilder(g).filters(filters).make()); - calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g); - } - } else { - genotypes.add(g); - } - } - - builder.genotypes(genotypes); - // if filtered genotypes are set to no-call, output recomputed AC, AN, AF - if ( haveFilteredNoCallAlleles ) - GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder); - } - - // make a new variant context based on filters - Set filters = new LinkedHashSet(vc.getFilters()); - - // test for clustered SNPs if requested - if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) ) - filters.add(CLUSTERED_SNP_FILTER_NAME); - - for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) { - try { - if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertFilterExpression) ) - filters.add(exp.name); - } catch (Exception e) { - // do nothing unless specifically asked to; it just means that the expression isn't defined for this context - if ( failMissingValues ) - filters.add(exp.name); - } - } - - if ( filters.isEmpty() ) - builder.passFilters(); - else - builder.filters(filters); - - writer.add(builder.make()); - } - - public Integer reduce(Integer 
value, Integer sum) { - return sum + value; - } - - /** - * Tell the user the number of loci processed and close out the new variants file. - * - * @param result the number of loci seen. - */ - public void onTraversalDone(Integer result) { - // move the window over so that we can filter the last few variants - if ( windowInitializer != null ) { - while ( windowInitializer.size() < WINDOW_SIZE ) - windowInitializer.add(null); - variantContextWindow = new FiltrationContextWindow(windowInitializer); - } - for (int i=0; i < WINDOW_SIZE; i++) { - variantContextWindow.moveWindow(null); - filter(); - } - } } diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java index 3ec696592..5262345fc 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java @@ -26,21 +26,24 @@ package org.broadinstitute.gatk.tools.walkers.filters; import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.VCFConstants; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.Utils; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; import java.io.File; import java.util.ArrayList; import 
java.util.Arrays; import java.util.List; -import org.testng.Assert; -import org.testng.annotations.*; +import java.util.Set; +import java.util.stream.Collectors; public class VariantFiltrationUnitTest extends BaseTest { @@ -58,7 +61,7 @@ public class VariantFiltrationUnitTest extends BaseTest { } @DataProvider(name = "VariantMaskData") - public Object[][] DoesMaskCoverVariantTestData() { + public Object[][] doesMaskCoverVariantTestData() { final String maskName = "testMask"; @@ -85,8 +88,8 @@ public class VariantFiltrationUnitTest extends BaseTest { * @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true. * @param expectedValue return the expected return value from doesMaskCoverVariant() */ - @Test(dataProvider = "VariantMaskData") - public void TestDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension, + @Test(dataProvider = "doesMaskCoverVariantTestData") + public void testDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension, final boolean vcBeforeLoc, final boolean expectedValue) { // Build VariantContext @@ -104,4 +107,126 @@ public class VariantFiltrationUnitTest extends BaseTest { boolean coversVariant = VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc); Assert.assertEquals(coversVariant, expectedValue); } + + @Test + public void testApplyGenotypeFilters(){ + + final VariantContext vc = buildDataForFilters().make(); + + final String filterName = "LowDP"; + final String filterExpr = "DP < 10"; + + final List genotypeFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr)); + + final VariantContextBuilder anotherVCBuilder = VariantFiltration.applyGenotypeFilters(vc, genotypeFilterExps, false, false, false); + final VariantContext anotherVC = anotherVCBuilder.filters().make(); + + 
Assert.assertEquals(anotherVC.getGenotype("one").isFiltered(), true); + Assert.assertTrue(anotherVC.getGenotype("one").getFilters().equals(filterName)); + + Assert.assertEquals(anotherVC.getGenotype("two").isFiltered(), false); + + Assert.assertEquals(anotherVC.getGenotype("three").isFiltered(), false); + + Assert.assertEquals(anotherVC.getGenotype("four").isFiltered(), false); + + Assert.assertEquals(anotherVC.getGenotype("five").isFiltered(), false); + + Assert.assertEquals(anotherVC.getGenotype("six").isFiltered(), false); + + final VariantContextBuilder yetAnotherVCBuilder = VariantFiltration.applyGenotypeFilters(anotherVC, genotypeFilterExps, false, true, false); + final VariantContext yetAnotherVC = yetAnotherVCBuilder.filters().make(); + Assert.assertEquals(yetAnotherVC.getGenotype("six").isFiltered(), true); + Assert.assertTrue(yetAnotherVC.getGenotype("six").getFilters().equals(filterName)); + } + + @Test + public void testApplyVCFilters(){ + + final VariantContext vcNoFilters = buildDataForFilters().make(); // assumes this vc doesn't hold any filters yet + + String filterName = "LowDP"; + String filterExpr = "DP < 23"; + List vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr)); + + final Set filters = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false); + Assert.assertFalse(vcNoFilters.isFiltered()); + Assert.assertEquals(filters.size(), 1); + Assert.assertTrue(filters.contains(filterName)); + + filterName = "ID"; + filterExpr = "ID = rs123"; + vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr)); + Set filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, true); +// Assert.assertEquals(filterWhenFailMissing.size(), 1); +// Assert.assertTrue(filterWhenFailMissing.contains(filterName)); + filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false); + 
Assert.assertTrue(filterWhenFailMissing.isEmpty()); + } + + + private static VariantContextBuilder buildDataForFilters() { + /** + * Uses (part of) the following (semi fake) data for testing (data was modified from real data so expect some minor inconsistencies in annotations) + * 1 1234567 . T C 152.03 . + * AC=6;AF=1.00;AN=6;DP=22;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;SOR=0.693;set=variant3-variant4-variant6 + * GT:AD:DP:GQ:PGT:PID:PL:RGQ + * 1/1:0,2:9:6:1|1:15870493_CT_C:90,6,0 + * 1/1:0,4:10:12:1|1:15870493_CT_C:180,12,0 + * ./.:0:3:.:.:.:.:0 + * ./.:0:0:.:.:.:.:0 + * ./.:0:0:.:.:.:.:0 + * 1/1:0,0:.:6:1|1:15870493_CT_C:90,6,0 + */ + + final Allele refT = Allele.create("T", true); + final Allele altC = Allele.create("C", false); + final Allele nocall = Allele.NO_CALL; + + final VariantContextBuilder vcBuilder = new VariantContextBuilder("", "1", 1234567, 1234567, Arrays.asList(refT, altC)); + + vcBuilder.noID(); + vcBuilder.attribute("AC", 6); + vcBuilder.attribute("AF", 1.00); + vcBuilder.attribute("AN", 6); + vcBuilder.attribute("DP", 22); + vcBuilder.attribute("ExcessHet", 3.0103); + vcBuilder.attribute("FS", 0.000); + vcBuilder.attribute("MLEAC", 2); + vcBuilder.attribute("MLEAF", 1.00); + vcBuilder.attribute("SOR", 0.693); + + GenotypeBuilder gtBuilder = new GenotypeBuilder("one", Arrays.asList(altC,altC)); + final Genotype firstSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR) + .DP(9) // edge case not passing "DP < 10" + .make(); + + gtBuilder = new GenotypeBuilder("two", Arrays.asList(altC,altC)); + final Genotype secondSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR) + .DP(10) // edge case passing "DP < 10" + .make(); + + gtBuilder = new GenotypeBuilder("three", Arrays.asList(nocall,nocall)); + final Genotype thirdSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL) + .DP(3) + .make(); + + gtBuilder = new GenotypeBuilder("four", Arrays.asList(nocall,nocall)); + 
final Genotype fourthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL) + .DP(0) + .make(); + + gtBuilder = new GenotypeBuilder("five", Arrays.asList(nocall,nocall)); + final Genotype fifthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL) + .DP(0) + .make(); + + gtBuilder = new GenotypeBuilder("six", Arrays.asList(altC,altC)); + final Genotype sixthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR) + .DP(-1) + .make(); + + vcBuilder.genotypes(firstSample, secondSample, thirdSample, fourthSample, fifthSample, sixthSample); + return vcBuilder; + } } From 956b18fbb729ad526ca9a2c076337b56bf5db047 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 6 Sep 2016 10:49:20 -0400 Subject: [PATCH 38/68] Make ReadPosRankSumTest.isUsableRead() account for deletions --- .../tools/walkers/annotator/ReadPosRankSumTest.java | 2 +- .../annotator/VariantAnnotatorIntegrationTest.java | 2 +- ...GenotyperGeneralPloidySuite1IntegrationTest.java | 2 +- ...GenotyperGeneralPloidySuite2IntegrationTest.java | 2 +- ...nifiedGenotyperNormalCallingIntegrationTest.java | 2 +- ...erComplexAndSymbolicVariantsIntegrationTest.java | 6 +++--- .../HaplotypeCallerGVCFIntegrationTest.java | 12 ++++++------ .../HaplotypeCallerIntegrationTest.java | 13 ++++++++++++- .../variantutils/GenotypeGVCFsIntegrationTest.java | 2 +- 9 files changed, 27 insertions(+), 16 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java index 62256c207..c4cb8c3b6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java @@ -129,7 +129,7 @@ public class 
ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio @Override protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) { - return super.isUsableRead(read, refLoc) && read.getSoftStart() + read.getCigar().getReadLength() > refLoc; + return super.isUsableRead(read, refLoc) && read.getSoftEnd() >= refLoc; } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java index 0029f78a6..bb71cc1c4 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -423,7 +423,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testStrandAlleleCountsBySample() { - final String MD5 = "564aeeefad92353d66dbb2a2222d5108"; + final String MD5 = "994d1d3c53e3878e17d20e1327506d77"; final WalkerTestSpec spec = new WalkerTestSpec( "-T HaplotypeCaller --disableDithering " + String.format("-R %s -I %s ", REF, CEUTRIO_BAM) + diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 456f5e8ca..5de6bcebf 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -88,6 +88,6 @@ public 
class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe //TODO the old MD5 is kept for the record. //TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach. // executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2"); - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "7421a776c75d0ab5a2ff89d9e7f105ff"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "5b76f96b6b74944e0c0d9914700588f0"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index ff0f23666..627ab6576 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -63,7 +63,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","f4092488c9785d800c3f6470af7119ce"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","cdaa55c53005deb132f600fa5539c254"); } @Test(enabled = true) diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index b6cc66d1c..a8f3f6187 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -126,7 +126,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("1759c156bc45528504398a7ef4ce5bf8")); + Arrays.asList("c41ff9e1e3cfb6bd45d772787dd8e2d3")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index d5c1eba67..39d2cf8c8 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public 
void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "b01df95864808dc67295efc6db37983d"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "79567a4e4307495e880e9782b3a88f7d"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "828ef27284bd4045148728952b3a7d94"); + "558820f3b67f4434a41e0cb96b6469c7"); } @Test @@ -114,7 +114,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleConsensusModeComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", - "060eed2610eed818b2ab55d582eb22ec"); + "47894766b0ce7d4aecd89e4938ac1c85"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 739371f39..b2a86148e 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -107,8 +107,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ae2c7e570855f6d6ca58ddd1089a970"}); - tests.add(new 
Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "8bb824886fb0e77d0e8317d69f9d1b62"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "1f19c2b2b528dff502bc1a47701edde7"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "1961007bd98a174a4a1b3e76a9c2f156"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "b34b0b61583628fbd51221627adcdb81"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"}); @@ -130,8 +130,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "450906ce3c11860c25b90cf0a56bb1a0"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "49f41972e19f6897659e497d32730dde"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "654c8264cfcbcb71da479761912fbd71"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "4959f20a8bd3327760d94ccc40157f81"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "50e628de2a79cd6887af020b713ca3b8"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "e48bbcf453e63a6ea5eeda05f6865f94"}); @@ -148,8 +148,8 @@ public class 
HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "0bc1ca3bff07381a344685b048e76ee4"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "3ff7e3cd9f6b1949d19f52fab53bdb5e"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "dd9fdcae44ab316c04650bf50c38e4b2"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "cb318100ae15cb3dcc342b6142ac6361"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "8a094080fb25bbcd39325dcdd62bcf65"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "685025831ac783784d7838e568e35f46"}); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index d170e3667..5303ecdf7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -107,7 +107,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeBAMOutFlags() throws IOException { - HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", 
"729ebefdce0d5ea6f535c354c329e6b9", "d38aab5bf8ef0bc7c18e8c909819da84"); + HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "a6abb0aa68d3b4d15185a119350e76dc", "d38aab5bf8ef0bc7c18e8c909819da84"); } @Test @@ -501,5 +501,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5)); executeTest("testSetZeroGQsToNoCall", spec); } + + @Test + public void testHaplotypeCallerReadPosRankSum() throws IOException { + final File testBAM = new File(privateTestDir + "testReadPos.snippet.bam"); + final String md5Variants = "03b3c464f22a3572f7d66890c18bdda4"; + final String md5BAMOut = "2e0843f6e8e90c407825e9c47ce4a32d"; + final String base = String.format("-T HaplotypeCaller -R %s -I %s -L 1:3753063 -ip 100 ", REF, testBAM) + + " --no_cmdline_in_header -o %s -bamout %s"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5Variants, md5BAMOut)); + executeTest("testHaplotypeCallerReadPosRankSum", spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index f511990c7..cb903ba79 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -282,7 +282,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference), 1, - Collections.singletonList("5d8fff160ec6eedb8e02c9207e256073")); + Collections.singletonList("7dfe841940c63415bd5d07ae5d0c69d7")); spec.disableShadowBCF(); //TODO: Remove when 
BaseTest.assertAttributesEquals() works with SAC executeTest("testStrandAlleleCountsBySample", spec); } From ab454e8812fcb7b0f222ae21048c5a3102e0a014 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 6 Sep 2016 18:57:42 -0400 Subject: [PATCH 39/68] Upgrade Apache Commons Collections to version 3.2.2 --- public/gatk-root/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/gatk-root/pom.xml b/public/gatk-root/pom.xml index aa12e0180..cd238569d 100644 --- a/public/gatk-root/pom.xml +++ b/public/gatk-root/pom.xml @@ -184,7 +184,7 @@ commons-collections commons-collections - 3.2.1 + 3.2.2 commons-httpclient From a7b1130224e910cbee46daed7a047d307e157e3e Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Thu, 8 Sep 2016 16:21:32 -0400 Subject: [PATCH 40/68] Do not emit GVCF block definitions in the header --- .../walkers/variantutils/GenotypeGVCFs.java | 11 ++++ .../GenotypeGVCFsIntegrationTest.java | 62 +++++++++---------- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index feb2fbb2e..7e716a979 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -133,6 +133,9 @@ import java.util.*; @Reference(window=@Window(start=-10,stop=10)) @SuppressWarnings("unused") public class GenotypeGVCFs extends RodWalker implements AnnotatorCompatible, TreeReducible { + + private static String GVCF_BLOCK = "GVCFBlock"; + /** * The gVCF files to merge together */ @@ -229,6 +232,14 @@ public class GenotypeGVCFs extends RodWalker headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + + // Remove GCVFBlocks + for ( 
final Iterator iter = headerLines.iterator(); iter.hasNext(); ) { + if ( iter.next().getKey().contains(GVCF_BLOCK) ) { + iter.remove(); + } + } + headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions()); headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index cb903ba79..659d6e2d5 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -83,7 +83,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference), 1, - Collections.singletonList("8d9788afd0de26bd9d9e55dd0e9fc3ed")); + Collections.singletonList("de36b46bc523b305bf344591a285c0d9")); executeTest("testUpdatePGT", spec); } @@ -93,7 +93,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample -log " + logFileName, b37KGReference), 1, - Collections.singletonList("5dd4698da963a423446bb1e183eb75aa")); + Collections.singletonList("c8eba89f434ca5e9dc0f157bcd4bea11")); executeTest("testUpdatePGTStrandAlleleCountsBySample", spec); final File file = new File(logFileName); @@ -108,7 +108,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000", b37KGReference), 1, - 
Collections.singletonList("61dd2aaabf94a8f5b87d5069a75d84d7")); + Collections.singletonList("b82f29eee8b1369b376ace857bf9b55a")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -120,7 +120,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Collections.singletonList("64fa89f20ee25df21ad20ce4ada7e7ad")); + Collections.singletonList("d6ef5e411ac5829a12d825a0fefac883")); executeTest("testTetraploidRun", spec); } @@ -132,7 +132,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Collections.singletonList("b1d93f4cd93093c208be2c9842f38d12")); + Collections.singletonList("b497f16cd9eb99e353d9430fe7f34635")); executeTest("testMixedPloidyRun", spec); } @@ -145,7 +145,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -L " + privateTestDir + "tetraploid-gvcfs.intervals" + " -maxNumPLValues 3", b37KGReference), 1, - Collections.singletonList("c0dcf62fb116c4c0baabe432eceea52c")); + Collections.singletonList("8c8ebe2069977ba13024a95827c6c50d")); executeTest("testMixedPloidyMaxNumPLValuesRun", spec); } @@ -157,7 +157,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Collections.singletonList("af19ee0d7e739143be4e252c48701c45")); + Collections.singletonList("edf083b3bf9cdec31b997a70fd56a7b2")); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); } @@ -170,7 +170,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 
20:10,000,000-20,000,000", b37KGReference), 1, - Collections.singletonList("3943e70eed48618040469e157509868e")); + Collections.singletonList("b304c7e3bb3625a1cdb5531c77b13bcd")); executeTest("combineSingleSamplePipelineGVCFHierarchical", spec); } @@ -182,7 +182,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Collections.singletonList("51d498327342bd3b0b092845b437aad5")); + Collections.singletonList("08adc638b9539fd275836ed008d900ee")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } @@ -203,7 +203,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + " -V " + privateTestDir + "gvcfExample1.vcf", 1, - Collections.singletonList("9ff344a5ab87a2c3b128e435e2e86db0")); + Collections.singletonList("df88bbf2eea39a06f2bcc47d9379e5fa")); executeTest("testJustOneSample", spec); } @@ -214,14 +214,14 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V " + privateTestDir + "gvcfExample1.vcf" + " -V " + privateTestDir + "gvcfExample2.vcf", 1, - Collections.singletonList("0c07ed795562ea96eab427e63a970384")); + Collections.singletonList("933c3ec48870c54f7f74b259272d6645")); executeTest("testSamplesWithDifferentLs", spec); } @Test public void testNoPLsException() { // Test with input files with (1) 0/0 and (2) ./. 
- final String md5 = "2f3d71272fdac19ac861cc7159edfb08"; + final String md5 = "91038469a8133feb05038528f8565840"; final WalkerTestSpec spec1 = new WalkerTestSpec( "-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference + " --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf", @@ -282,7 +282,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference), 1, - Collections.singletonList("7dfe841940c63415bd5d07ae5d0c69d7")); + Collections.singletonList("8a6e69c8d0b4dd9bf0646173f1b1f32c")); spec.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC executeTest("testStrandAlleleCountsBySample", spec); } @@ -299,7 +299,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + " --uniquifySamples", b37KGReference), 1, - Collections.singletonList("c23b1e3f9a960e022038768998a8df82")); + Collections.singletonList("1cb3bddf47c620d294b08acd70d35fa3")); executeTest("testUniquifiedSamples", spec); } @@ -471,7 +471,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { } - private static final String simpleSpanningDeletionsMD5 = "4629c2f02ff58c111828269091cded82"; + private static final String simpleSpanningDeletionsMD5 = "53f2b8991e49a47efc44b8e02ebb8d91"; @Test public void testSpanningDeletionsMD5() { @@ -501,7 +501,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf -V " + privateTestDir + "spanningDel.3.g.vcf", 1, - Collections.singletonList("7fe5364565585d31a0bb6a9dfa4a01d4")); + Collections.singletonList("907dfaa4d31c22705eadd5890ae23929")); spec.disableShadowBCF(); 
executeTest("testMultipleSpanningDeletionsMD5", spec); } @@ -512,7 +512,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.delOnly.g.vcf", 1, - Collections.singletonList("057f9368f380bf3c12b539a749deac61")); + Collections.singletonList("b923e5c6d5dbce62034178bd5234b932")); spec.disableShadowBCF(); executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec); } @@ -523,7 +523,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.depr.delOnly.g.vcf", 1, - Collections.singletonList("e8f5186718050fe0784416e41425563f")); + Collections.singletonList("01ae75dfe5c0c2350fcef0f4cdca36b2")); spec.disableShadowBCF(); executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec); } @@ -546,7 +546,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "ad-bug-input.vcf", 1, - Collections.singletonList("5ed5cb6aac68aa8943dc45b8b90eb508")); + Collections.singletonList("4d6cbd8d666a43fc136d73de2b217719")); spec.disableShadowBCF(); executeTest("testBadADPropagationHaploidBugTest", spec); } @@ -557,7 +557,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "261_S01_raw_variants_gvcf.vcf", 1, - Collections.singletonList("37eec6aedd26aa3430a15d90d7f8a011")); + Collections.singletonList("ea96440b537dd1b2b25ea565dfaa71fc")); spec.disableShadowBCF(); executeTest("testSAC", spec); } @@ -568,7 +568,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + 
"tetraploid-multisample-sac.g.vcf", 1, - Collections.singletonList("76532a74d4ba49f23362c149ad31a229")); + Collections.singletonList("c21c847ef794c11e249985a16893b2fa")); spec.disableShadowBCF(); executeTest("testSACMultisampleTetraploid", spec); } @@ -579,7 +579,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { baseTestString(" -V " + privateTestDir + "set.zero.RGQs.no.call.sample1.g.vcf" + " -V " + privateTestDir + "set.zero.RGQs.no.call.sample2.g.vcf" + " -L chr16:1279274-1279874 -allSites", hg19ReferenceWithChrPrefixInChromosomeNames), - Collections.singletonList("92c097d8b6074d40f8d1385bc92a0a5d")); + Collections.singletonList("fc7016c0cd5cfa186bab80329eb0bc13")); spec.disableShadowBCF(); executeTest("testSetZeroRGQsToNoCall", spec); } @@ -588,7 +588,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("89712a9fe5b6db16be2257be2b0b4759")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("29d6db0a93abd72d64fb1e82da65c715")); spec.disableShadowBCF(); executeTest("testAlleleSpecificAnnotations", spec); } @@ -597,7 +597,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testASMateRankSumAnnotation() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest --disableDithering -V " + privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.MateRankSum.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, 
Collections.singletonList("8e41a139600ab58a67910cdc60053726")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("2a330015a7db9f9aee9bc5b776698f73")); spec.disableShadowBCF(); executeTest("testASMateRankSumAnnotation", spec); } @@ -606,7 +606,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testASInsertSizeRankSumAnnotation() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.InsertSizeRankSum.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("b1334fbfbf21934aac1c1eda0b5062d5")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("75aee1e0c8c3528180e344ec6c0d8ffd")); spec.disableShadowBCF(); executeTest("testASInsertSizeRankSumAnnotation", spec); } @@ -619,7 +619,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations_oneSample() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + privateTestDir + "NA12878.AS.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("7d86260e91fe74588e01339a2064b59c")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("f4fa3acec2b21037368898e913b7a3fa")); spec.disableShadowBCF(); executeTest("testAlleleSpecificAnnotations_oneSample", spec); } @@ -629,7 +629,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations_elevenSamples() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + privateTestDir + "multiSamples.g.vcf"; - 
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("a889fe6775575513e84905b4fa98f8b3")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("4e90f6908248fac9b3ce3e545180a8e5")); spec.disableShadowBCF(); executeTest("testAlleleSpecificAnnotations_elevenSamples", spec); } @@ -638,7 +638,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testMonomorphicVCwithAlt() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header --disableDithering -V " + privateTestDir + "monomorphicGVCwithAlt.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("8bf329a40637623515972dcc0e09a49e")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("43953b3e75a4d470b65773b1b5bea066")); spec.disableShadowBCF(); executeTest("testAlleleSpecificAnnotations", spec); } @@ -647,7 +647,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testFractionInformativeReads() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header -A FractionInformativeReads --disableDithering -V " + privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("b338bf1807791b23255b8cb1947c01b2")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("0b1bbcc7d24f8b0945c97907b1cdd974")); spec.disableShadowBCF(); executeTest("testAlleleSpecificAnnotations", spec); } @@ -657,7 +657,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testGenotypingSpanningDeletionWithAllSites() { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "spanningDel.genotyping.g.vcf -allSites", b37KGReference), - 
Collections.singletonList("7cc3b08a37ed0c2e556debc1023cff2b")); + Collections.singletonList("e2370ba728cc9b73950b2ed616ef669f")); spec.disableShadowBCF(); executeTest("testGenotypingSpanningDeletionWithAllSites", spec); } @@ -666,7 +666,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testGenotypingSpanningDeletionAcrossLines() { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "input-1_2256566.vcf", b37KGReference), - Collections.singletonList("24ac243e77e679508c6554194923317b")); + Collections.singletonList("152c8e07e35c592868f43626f27365de")); spec.disableShadowBCF(); executeTest("testGenotypingSpanningDeletionAcrossLines", spec); } From 091d05370b6beb627e2f6eee990e78cf7e433cac Mon Sep 17 00:00:00 2001 From: Steve Huang Date: Tue, 13 Sep 2016 21:20:44 -0400 Subject: [PATCH 41/68] Fix 1448 to make alt allele removal by likelihoods robust to ref allele indices (#1475) * alt alle removal by likelihoods now robust to ref allele indices (no longer assumes 0-indexed ref) --- .../HaplotypeCallerGenotypingEngine.java | 10 ++++-- ...plotypeCallerGenotypingEngineUnitTest.java | 34 +++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index ad0f2b773..1f31e7cd6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -487,7 +487,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine excessAlternativeAlleles(final GenotypingLikelihoods 
genotypeLikelihoods, final int maxAlternativeAlleles) { + @VisibleForTesting + static Set excessAlternativeAlleles(final GenotypingLikelihoods genotypeLikelihoods, final int maxAlternativeAlleles) { final int alleleCount = genotypeLikelihoods.alleleCount(); final int excessAlternativeAlleleCount = Math.max(0, alleleCount - 1 - maxAlternativeAlleles); if (excessAlternativeAlleleCount <= 0) { @@ -539,8 +540,11 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine result = new HashSet<>(excessAlternativeAlleleCount); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java index f8dd4b39c..25ed5e59e 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java @@ -59,10 +59,15 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.variant.variantcontext.*; +import org.broadinstitute.gatk.tools.walkers.genotyper.*; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.EventMap; import 
org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; @@ -531,4 +536,33 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { Assert.assertEquals(uniqueGroups.size(), expectedNumGroups); Assert.assertEquals(counter, expectedGroupSize); } + + @Test + public void testExcessAlternativeAllelesKeepRef(){ + + // prep data + final Allele ref = Allele.create("A", true); + final Allele altC = Allele.create("C", false); + final Allele altG = Allele.create("G", false); + final Allele altT = Allele.create("T", false); + final AlleleList indexedAlleleList = new IndexedAlleleList<>(altC, altG, altT, ref);// specifically make the ref allele not at index 0 + + final IndexedSampleList indexedSampleList = new IndexedSampleList("Dummy"); + + final List reads = new ArrayList<>(); + for (int i=0; i<10; ++i) { + reads.add(GATKSAMRecord.createRandomRead(101)); + } + final Map> sampleToReads = Collections.singletonMap(indexedSampleList.sampleAt(0), reads); + final ReadLikelihoods readLikelihoods = new ReadLikelihoods<>(indexedSampleList, indexedAlleleList, sampleToReads); + final PloidyModel ploidyModel = new HomogeneousPloidyModel(indexedSampleList, 2); + final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel(); + + final GenotypingLikelihoods genotypeLikelihoods = genotypingModel.calculateLikelihoods(readLikelihoods, new GenotypingData<>(ploidyModel,readLikelihoods)); + + // test + final Set excessAltAlleles = HaplotypeCallerGenotypingEngine.excessAlternativeAlleles(genotypeLikelihoods, 2); + Assert.assertFalse(excessAltAlleles.contains(ref)); + Assert.assertEquals(excessAltAlleles.size(), 1); + } } From 6564066de3c28304e767d8575e42f075ba23b9cc Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 6 Sep 2016 18:57:42 -0400 Subject: [PATCH 42/68] Remove NON_REF from allSites VCF output --- .../walkers/variantutils/GenotypeGVCFs.java | 46 ++++++++++++++++++- 
.../GenotypeGVCFsIntegrationTest.java | 17 +++++-- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index 7e716a979..53a2f1790 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -157,7 +157,7 @@ public class GenotypeGVCFs extends RodWalker> getResourceRodBindings() { return Collections.emptyList(); } public boolean alwaysAppendDbsnpId() { return false; } + // INFO Header names that require alt alleles + final Set infoHeaderAltAllelesLineNames = new LinkedHashSet<>(); public void initialize() { boolean inputsAreTagged = false; @@ -248,6 +250,18 @@ public class GenotypeGVCFs extends RodWalker newAlleles = new ArrayList<>(); + // Only keep alleles that are not NON-REF + for ( final Allele allele : vc.getAlleles() ) { + if ( !allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ) { + newAlleles.add(allele); + } + } + + final VariantContextBuilder builder = new VariantContextBuilder(vc).alleles(newAlleles); + + // No alt allele, so remove INFO fields that require alt alleles + if ( newAlleles.size() == 1 ) { + for ( final String name : infoHeaderAltAllelesLineNames ) { + builder.rmAttributes(Arrays.asList(name)); + } + } + + return builder.make(); + } + /** * Determines whether the provided VariantContext has real alternate alleles. 
* diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 659d6e2d5..98b806f40 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -157,7 +157,8 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Collections.singletonList("edf083b3bf9cdec31b997a70fd56a7b2")); + Collections.singletonList("ea11554de21ef8f25e9983db8b5a8480")); + spec.disableShadowBCF(); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); } @@ -245,6 +246,16 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { executeTest("testNDA", spec); } + @Test + public void testAllSitesNonBiallelic() { + final WalkerTestSpec spec = new WalkerTestSpec( + baseBPResolutionString("-allSites"), + 1, + Collections.singletonList("77924e6b958a30f954e1c3a9f504a6a7")); + spec.disableShadowBCF(); + executeTest("testAllSitesNonBiallelic", spec); + } + @Test public void testMaxAltAlleles() { final WalkerTestSpec spec = new WalkerTestSpec( @@ -579,7 +590,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { baseTestString(" -V " + privateTestDir + "set.zero.RGQs.no.call.sample1.g.vcf" + " -V " + privateTestDir + "set.zero.RGQs.no.call.sample2.g.vcf" + " -L chr16:1279274-1279874 -allSites", hg19ReferenceWithChrPrefixInChromosomeNames), - Collections.singletonList("fc7016c0cd5cfa186bab80329eb0bc13")); + 
Collections.singletonList("e88db6e49c12487c55de42769d2f8c6c")); spec.disableShadowBCF(); executeTest("testSetZeroRGQsToNoCall", spec); } @@ -657,7 +668,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testGenotypingSpanningDeletionWithAllSites() { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "spanningDel.genotyping.g.vcf -allSites", b37KGReference), - Collections.singletonList("e2370ba728cc9b73950b2ed616ef669f")); + Collections.singletonList("d3d862faf954f9bb8b1619c3e889ad8c")); spec.disableShadowBCF(); executeTest("testGenotypingSpanningDeletionWithAllSites", spec); } From c13e3752d67e90143635e93ca36a0849204bc295 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 16 Sep 2016 11:30:07 -0400 Subject: [PATCH 43/68] Move htsjdk to ver 2.6.1 and picard to ver 2.6.0 --- .../DiagnoseTargetsIntegrationTest.java | 4 ++-- .../VariantFiltrationIntegrationTest.java | 20 +++++++++---------- .../SelectVariantsIntegrationTest.java | 8 ++++---- public/gatk-root/pom.xml | 4 ++-- .../indels/IndelRealignerIntegrationTest.java | 14 ++++++------- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java index 6ac91a350..ce54ece5f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java @@ -71,11 +71,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + 
singleSample + " -max 75", "13bfe41ef083d2716e07d35223916a4e"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "19c56b853b20ac674b6de1332043586d"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "64b4fa6cf4c4d16e822289990ee88240"); + DTTest("testMultiSample ", "-I " + multiSample, "90770023666f3c1d6a3f35e5ecada4a8"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java index 9f2678d20..89ffe527f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java @@ -164,7 +164,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("ced70cfb4e6681a3aa0633cd0510ada0")); + Arrays.asList("b6e8d70223826000ea1a6d6bc9c4fc65")); executeTest("test genotype filter #1", spec1); } @@ -172,7 +172,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -G_filter 'isHomVar == 1' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("837b6a3ce3fad3bd77ec3e870c4d2f10")); + Arrays.asList("9cd315a433ab7d9da637156011328509")); executeTest("test genotype filter #2", spec2); } @@ -207,7 +207,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { 
WalkerTestSpec spec = new WalkerTestSpec( "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormat.vcf", 1, - Arrays.asList("260dd9d7e35737fe695b241b7a5a52a2")); + Arrays.asList("b0016040127766a4163fcbd91afff3ea")); executeTest("testFilteringDPfromFORMAT", spec); } @@ -216,7 +216,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + " --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1, - Arrays.asList("4bf46103a71bac92a11eae04b97f9877")); + Arrays.asList("cc55e6a7bae2ab3503ecefc973ec1c2d")); executeTest("testFilteringDPfromFORMATWithMissing", spec); } @@ -225,7 +225,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + " --missingValuesInExpressionsShouldEvaluateAsFailing --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1, - Arrays.asList("baeda696c92adc8745ac4ebbdead6c91")); + Arrays.asList("521e6f33325a051ced28152a1e7c273d")); executeTest("testFilteringDPfromFORMATAndFailMissing", spec); } @@ -234,7 +234,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invertGenotypeFilterExpression", 1, - Arrays.asList("907527b89d3f819cc3f6f88f51fcaaf6")); + Arrays.asList("c6bc275c97a9e737748d16132ee76f48")); executeTest("testInvertGenotypeFilterExpression", spec); } @@ 
-243,7 +243,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + " --genotypeFilterExpression 'DP >= 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf", 1, - Arrays.asList("d79b2e5a7502a6d6e902bc40d74cc826")); // Differs from testInvertFilter because FILTER description uses the -genotypeFilterExpression argument + Arrays.asList("9321b5993d51a4da02f69e5467164587")); // Differs from testInvertFilter because FILTER description uses the -genotypeFilterExpression argument executeTest("testInvertJexlGenotypeFilterExpression", spec); } @@ -252,7 +252,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormat.vcf --setFilteredGtToNocall", 1, - Arrays.asList("2ff3753215d418712309e50da323f6e8")); + Arrays.asList("00990d54017b7384ce9f979d796b9d16")); executeTest("testSetFilteredGtoNocall", spec); } @@ -263,7 +263,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + " -G_filter 'GQ < 20' -G_filterName lowDP -G_filter 'DP<10' -G_filterName lowGQ -V " + privateTestDir + "variantFiltrationInfoField.vcf --setFilteredGtToNocall", 1, - Arrays.asList("3b074975bb6f70c84b2dd81695bb89ff")); + Arrays.asList("0f8ed3a62a53feca0c4b86671e4b53e4")); executeTest("testSetFilteredGtoNocallUpdateInfo", spec); } @@ -274,7 +274,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", 1, - 
Arrays.asList("410c6b7bb62fc43bb41eee627670f757") + Arrays.asList("cb5ef9233503bebc81593e436a6de943") ); spec.disableShadowBCF(); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index 2709eb3b2..ad97cf4a5 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -67,8 +67,8 @@ public class SelectVariantsIntegrationTest extends WalkerTest { private static final String SAMPLE_EXCLUSION_MD5 = "2e52f21e7dcc67151a51630807a4eef2"; private static final String INVERT_SELECTION_MD5 = "26d192b868746ab14133f145ae812e7c"; - private static final String MAX_FILTERED_GT_SELECTION_MD5 = "f83ac0deb7a8b022d6d40a85627a71ec"; - private static final String MIN_FILTERED_GT_SELECTION_MD5 = "346620b7a5d66dabf89d3f42d6e27db7"; + private static final String MAX_FILTERED_GT_SELECTION_MD5 = "66d92fac72b339195b393c9915643a14"; + private static final String MIN_FILTERED_GT_SELECTION_MD5 = "965c0cf7daa03a1731b371bb20b582d4"; private static final String NO_CALL_FILTERING_KEEP_ONE = "6e2401190c5ada6a3bed2640c068f43b"; private static final String NO_CALL_FILTERING_KEEP_TWO = "6bced1ab6a3d58f1fd905b7f601987a3"; @@ -744,7 +744,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("410c6b7bb62fc43bb41eee627670f757") + Arrays.asList("cb5ef9233503bebc81593e436a6de943") ); spec.disableShadowBCF(); @@ -759,7 +759,7 @@ public class SelectVariantsIntegrationTest 
extends WalkerTest { "-T SelectVariants --setFilteredGtToNocall --removeUnusedAlternates --excludeNonVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("349136d92f915f8c7ba8a2f92e51d6b7")); + Arrays.asList("f5b2592361d8ab0d47e5047e63f78e4c")); executeTest("testSetFilteredGtoNocallUpdateInfo", spec); } diff --git a/public/gatk-root/pom.xml b/public/gatk-root/pom.xml index cd238569d..a6ed90656 100644 --- a/public/gatk-root/pom.xml +++ b/public/gatk-root/pom.xml @@ -44,8 +44,8 @@ org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.gatk.utils.TestNGTestTransformer,org.broadinstitute.gatk.utils.GATKTextReporter,org.uncommons.reportng.HTMLReporter - 2.5.0 - 2.5.0 + 2.6.1 + 2.6.0 diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerIntegrationTest.java index 81b2c457c..0c3974b6c 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerIntegrationTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerIntegrationTest.java @@ -40,8 +40,8 @@ public class IndelRealignerIntegrationTest extends WalkerTest { private static final String knownIndels = validationDataLocation + "indelRealignerTest.pilot1.ceu.vcf"; private static final String baseCommandPrefix = "-T IndelRealigner -noPG -R " + b36KGReference + " -I " + mainTestBam + " -targetIntervals " + mainTestIntervals + " -compress 0 -L 20:49,500-55,500 "; private static final String baseCommand = baseCommandPrefix + "-o %s "; - private static final String base_md5 = "ab7407d2299d9ba73449cea376eeb9c4"; - private static final String base_md5_with_SW_or_VCF = "fa57bd96b83038ac6a70e58e11bf5364"; + private static final String base_md5 = 
"12e7c9fd7af4fc9184c5f58a1660eac5"; + private static final String base_md5_with_SW_or_VCF = "2d3f79298687da007da52286b5c7261d"; @Test public void testDefaults() { @@ -64,7 +64,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest { WalkerTestSpec spec1 = new WalkerTestSpec( baseCommand + "--consensusDeterminationModel KNOWNS_ONLY -known " + knownIndels, 1, - Arrays.asList("c42b6f3e1270e43cce2b6f75b6a38f30")); + Arrays.asList("3d028025dcb8d268262274d8ffc42635")); executeTest("realigner known indels only from VCF", spec1); } @@ -81,7 +81,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest { public void testLods() { HashMap e = new HashMap(); e.put("-LOD 60", base_md5); - e.put( "-LOD 1 --consensusDeterminationModel USE_SW", "0c4597e48b4e194de32ebe494704ea6b" ); + e.put( "-LOD 1 --consensusDeterminationModel USE_SW", "44868da9b026201572cbfaaedacc57eb" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -97,7 +97,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T IndelRealigner -noPG -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10,000,000-11,000,000 -targetIntervals " + validationDataLocation + "indelRealignerTest.NA12878.chrom1.intervals -compress 0 -o %s", 1, - Arrays.asList("19e6859b9ef09c7e0a79a19626908b17")); + Arrays.asList("c40aa32bca520015acb175fde52b4ed4")); executeTest("realigner long run", spec); } @@ -106,7 +106,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseCommand + "--noOriginalAlignmentTags --consensusDeterminationModel USE_SW", 1, - Arrays.asList("8f5684359d7b26acaacfa657ef395a0c")); + Arrays.asList("3adc7711a163a65a570a47fe28eb4d24")); executeTest("realigner no output tags", spec); } @@ -128,7 +128,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest { @Test 
public void testMaxReadsInMemory() { HashMap e = new HashMap(); - e.put("--maxReadsInMemory 10000", "236c64f2da0047534b44444d9d699378"); + e.put("--maxReadsInMemory 10000", "b8a4491506303dc96cf105ba069dd928"); e.put( "--maxReadsInMemory 40000", base_md5 ); for ( Map.Entry entry : e.entrySet() ) { From 318bee2269ba08f2bf3dd79a5f88a3abb0ddfb98 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Fri, 26 Aug 2016 14:05:49 -0400 Subject: [PATCH 44/68] Backport new AFCalculator --- ...GenotypeCalculationArgumentCollection.java | 13 + .../genotyper/GenotypeAlleleCounts.java | 72 +++-- .../walkers/genotyper/GenotypingEngine.java | 21 +- .../genotyper/afcalc/AFCalculationResult.java | 2 +- .../genotyper/afcalc/AFCalculator.java | 2 +- .../afcalc/AlleleFrequencyCalculator.java | 236 +++++++++++++++++ .../broadinstitute/gatk/utils/Dirichlet.java | 121 +++++++++ .../broadinstitute/gatk/utils/IndexRange.java | 248 +++++++++++++++++ .../GenotypeAlleleCountsUnitTest.java | 27 ++ .../AlleleFrequencyCalculatorUnitTest.java | 250 ++++++++++++++++++ .../broadinstitute/gatk/utils/MathUtils.java | 72 +++++ .../org/broadinstitute/gatk/utils/Utils.java | 51 ++++ 12 files changed, 1085 insertions(+), 30 deletions(-) create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/Dirichlet.java create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/IndexRange.java create mode 100644 protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculatorUnitTest.java diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index d879eec82..9e14efaac 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -71,6 +71,12 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** + * Use the new allele frequency / QUAL score model + */ + @Argument(fullName = "useNewAFCalculator", shortName = "newQual", doc = "If provided, we will use the new AF model instead of the so-called exact model", required = false) + public boolean USE_NEW_AF_CALCULATOR = false; + /** * The expected heterozygosity value used to compute prior probability that a locus is non-reference. * @@ -102,6 +108,13 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY; + /** + * The standard deviation of the distribution of alt allele fractions. The above heterozygosity parameters give the + * *mean* of this distribution; this parameter gives its spread. 
+ */ + @Argument(fullName = "heterozygosity_stdev", shortName = "heterozygosityStandardDeviation", doc = "Standard deviation of eterozygosity for SNP and indel calling.", required = false) + public double heterozygosityStandardDeviation = 0.01; + /** * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeAlleleCounts.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeAlleleCounts.java index f1edc7eae..daf7e653c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeAlleleCounts.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeAlleleCounts.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.IndexRange; import org.broadinstitute.gatk.utils.MathUtils; import java.util.Arrays; @@ -108,7 +109,18 @@ import java.util.List; */ public class GenotypeAlleleCounts implements Comparable, Cloneable { - private double log10CombinationCount; + private static final double UNCOMPUTED_LOG_10_COMBINATION_COUNT = -1; + + /** + * The log10 number of phased genotypes corresponding to this unphased genotype. For example, + * [0, 1, 1, 1] = AB: log10(2) + * [0, 2] = AA: log10(1) + * [0, 1, 1, 1, 2, 1] = ABC: log10(6) + * [0, 2, 1, 2] = AABB: log10(4!/(2!2!)) + * This is evaluated lazily i.e. it is initialized to {@link GenotypeAlleleCounts::UNCOMPUTED_LOG_10_COMBINATION_COUNT} + * and only calculated if its getter is invoked. 
+ */ + private double log10CombinationCount = UNCOMPUTED_LOG_10_COMBINATION_COUNT; /** * The ploidy of the genotype. @@ -156,38 +168,30 @@ public class GenotypeAlleleCounts implements Comparable, C * @param index the genotype index. */ private GenotypeAlleleCounts(final int ploidy, final int index, final int... sortedAlleleCounts) { + this(ploidy, index, sortedAlleleCounts, sortedAlleleCounts.length >> 1); + } + + private GenotypeAlleleCounts(final int ploidy, final int index, final int[] sortedAlleleCounts, final int distinctAlleleCount){ this.ploidy = ploidy; - this.sortedAlleleCounts = sortedAlleleCounts; - distinctAlleleCount = sortedAlleleCounts.length >> 1; - log10CombinationCount = -1; this.index = index; + this.sortedAlleleCounts = sortedAlleleCounts; + this.distinctAlleleCount = distinctAlleleCount; } /** - * Returns the log10 of the number of possible allele combinations that would give raise to this allele count. - * @return 0 or less. + * Gets the log10 combination count, computing it if uninitialized. Note that the invoked MathUtils method uses fast cached + * log10 values of integers for any reasonable ploidy. + * + * This method should be invoked on instances of {@link GenotypeAlleleCounts} cached in {@link GenotypeLikelihoodCalculators::genotypeTableByPloidy}. + * Such usage allows the result of this computation to be cached once for an entire run of HaplotypeCaller. + * @return */ public double log10CombinationCount() { - if (log10CombinationCount == -1) - return log10CombinationCount = calculateLog10CombinationCount(); - else - return log10CombinationCount; - } - - /** - * Calculates log10 combination count. - * - * @return 0 or less. 
- */ - private double calculateLog10CombinationCount() { - if (ploidy <= 1) - return 0; - else { - final int[] counts = new int[distinctAlleleCount]; - for (int i = 0, j = 1; i < distinctAlleleCount; i++, j+=2) - counts[i] = sortedAlleleCounts[j]; - return MathUtils.log10MultinomialCoefficient(ploidy, counts); + if (log10CombinationCount == UNCOMPUTED_LOG_10_COMBINATION_COUNT) { + log10CombinationCount = MathUtils.log10Factorial(ploidy) + - new IndexRange(0, distinctAlleleCount).sum(n -> MathUtils.log10Factorial(sortedAlleleCounts[2*n+1])); } + return log10CombinationCount; } /** @@ -785,4 +789,22 @@ public class GenotypeAlleleCounts implements Comparable, C } return result; } + + @FunctionalInterface + public interface IntBiConsumer { + void accept(final int alleleIndex, final int alleleCount); + } + + @FunctionalInterface + public interface IntToDoubleBiFunction { + double apply(final int alleleIndex, final int alleleCount); + } + + public void forEachAlleleIndexAndCount(final IntBiConsumer action) { + new IndexRange(0, distinctAlleleCount).forEach(n -> action.accept(sortedAlleleCounts[2*n], sortedAlleleCounts[2*n+1])); + } + + public double sumOverAlleleIndicesAndCounts(final IntToDoubleBiFunction func) { + return new IndexRange(0, distinctAlleleCount).sum(n -> func.apply(sortedAlleleCounts[2*n], sortedAlleleCounts[2*n+1])); + } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index a9ec24c4f..04e33c53b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -60,6 +60,7 @@ import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import 
org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; +import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AlleleFrequencyCalculator; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; @@ -107,6 +108,8 @@ public abstract class GenotypingEngine upstreamDeletionsLoc = new LinkedList<>(); + protected final AFCalculator newAFCalculator; + /** * Construct a new genotyper engine, on a specific subset of samples. * @@ -139,6 +142,11 @@ public abstract class GenotypingEngine alleles = vc.getAlleles(); + Utils.validateArg( numAlleles > 1, "VariantContext has only a single reference allele, but getLog10PNonRef requires at least one at all " + vc); + + final double[] priorPseudocounts = alleles.stream() + .mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() > 1 ? 
snpPseudocount : indelPseudocount)).toArray(); + + double[] alleleCounts = new double[numAlleles]; + final double flatLog10AlleleFrequency = -MathUtils.Log10Cache.get(numAlleles); // log10(1/numAlleles) + double[] log10AlleleFrequencies = new IndexRange(0, numAlleles).mapToDouble(n -> flatLog10AlleleFrequency); + double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY; + + while (alleleCountsMaximumDifference > THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE) { + final double[] newAlleleCounts = effectiveAlleleCounts(vc, log10AlleleFrequencies); + alleleCountsMaximumDifference = Arrays.stream(MathArrays.ebeSubtract(alleleCounts, newAlleleCounts)).map(Math::abs).max().getAsDouble(); + alleleCounts = newAlleleCounts; + final double[] posteriorPseudocounts = MathArrays.ebeAdd(priorPseudocounts, alleleCounts); + + // first iteration uses flat prior in order to avoid local minimum where the prior + no pseudocounts gives such a low + // effective allele frequency that it overwhelms the genotype likelihood of a real variant + // basically, we want a chance to get non-zero pseudocounts before using a prior that's biased against a variant + log10AlleleFrequencies = new Dirichlet(posteriorPseudocounts).log10MeanWeights(); + } + + double[] log10POfZeroCountsByAllele = new double[numAlleles]; + double log10PNoVariant = 0; + + for (final Genotype g : vc.getGenotypes()) { + if (!g.hasLikelihoods()) { + continue; + } + final int ploidy = g.getPloidy() == 0 ? 
defaultPloidy : g.getPloidy(); + final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles); + final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); + + //the total probability + log10PNoVariant += Math.log10(genotypePosteriors[HOM_REF_GENOTYPE_INDEX]); + + // per allele non-log space probabilities of zero counts for this sample + // for each allele calculate the total probability of genotypes containing at least one copy of the allele + final double[] pOfNonZeroAltAlleles = new double[numAlleles]; + + for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) { + final double genotypePosterior = genotypePosteriors[genotype]; + glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) -> + pOfNonZeroAltAlleles[alleleIndex] += genotypePosterior); + } + + for (int allele = 0; allele < numAlleles; allele++) { + log10POfZeroCountsByAllele[allele] += Math.log10(1 - pOfNonZeroAltAlleles[allele]); + } + } + + // unfortunately AFCalculationResult expects integers for the MLE. We really should emit the EM no-integer values + // which are valuable (eg in CombineGVCFs) as the sufficient statistics of the Dirichlet posterior on allele frequencies + final int[] integerAlleleCounts = Arrays.stream(alleleCounts).mapToInt(x -> (int) Math.round(x)).toArray(); + final int[] integerAltAlleleCounts = Arrays.copyOfRange(integerAlleleCounts, 1, numAlleles); + + //skip the ref allele (index 0) + final Map log10PRefByAllele = IntStream.range(1, numAlleles).boxed() + .collect(Collectors.toMap(alleles::get, a -> log10POfZeroCountsByAllele[a])); + + // we compute posteriors here and don't have the same prior that AFCalculationResult expects. 
Therefore, we + // give it our posterior as its "likelihood" along with a flat dummy prior + final double[] dummyFlatPrior = {-1e-10, -1e-10}; //TODO: HACK must be negative for AFCalcResult + final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, Math.log10(1 - Math.pow(10, log10PNoVariant))}; + + return new AFCalculationResult(integerAltAlleleCounts, DUMMY_N_EVALUATIONS, alleles, log10PosteriorOfNoVariantYesVariant, dummyFlatPrior, log10PRefByAllele); + } + + private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) { + final int numAlleles = vc.getNAlleles(); + Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent"); + final double[] result = new double[numAlleles]; + for (final Genotype g : vc.getGenotypes()) { + if (!g.hasLikelihoods()) { + continue; + } + final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles); + final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); + + new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex -> + glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) -> + result[alleleIndex] += genotypePosteriors[genotypeIndex] * count)); + } + return result; + } + + private static double[] normalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) { + final double[] log10Likelihoods = g.getLikelihoods().getAsVector(); + final double[] unnormalizedLog10Likelihoods = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> { + final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(genotypeIndex); + return gac.log10CombinationCount() + log10Likelihoods[genotypeIndex] + + gac.sumOverAlleleIndicesAndCounts((index, count) -> count * log10AlleleFrequencies[index]); + }); + return 
MathUtils.normalizeFromLog10(unnormalizedLog10Likelihoods); + } + + @Override //Note: unused + protected AFCalculationResult getResultFromFinalState(final VariantContext vc, final double[] priors, final StateTracker st) { return null; } + + @Override//Note: unused + protected AFCalculationResult computeLog10PNonRef(final VariantContext vc, final int defaultPloidy, + final double[] priors, final StateTracker st) { return null; } + + @Override //Note: unused + protected StateTracker getStateTracker(final boolean reset, final int maximumAlternativeAlleleCount) { return null; } + + @Override //trivial implementation -- new AFCalculator can handle multiallelics so we're not afraid + protected VariantContext reduceScope(final VariantContext vc, final int defaultPloidy, final int maximumAlternativeAlleles) { + return vc; + } + + @Override //also trivial + public GenotypesContext subsetAlleles(final VariantContext vc, + final int defaultPloidy, + final List allelesToUse, + final boolean assignGenotypes) { + return vc.getGenotypes(); + } + + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/Dirichlet.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/Dirichlet.java new file mode 100644 index 000000000..c111ccb3a --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/Dirichlet.java @@ -0,0 +1,121 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE"). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2016 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.utils; + +import org.apache.commons.math3.special.Gamma; +import org.apache.commons.math3.util.MathArrays; + +import java.util.Arrays; +import java.util.Collections; +import java.util.stream.IntStream; + +/** + * The Dirichlet distribution is a distribution on multinomial distributions: if pi is a vector of positive multinomial weights + * such that sum_i pi[i] = 1, the Dirichlet pdf is P(pi) = [prod_i Gamma(alpha[i]) / Gamma(sum_i alpha[i])] * prod_i pi[i]^(alpha[i] - 1) + * + * The vector alpha comprises the sufficient statistics for the Dirichlet distribution. + * + * Since the Dirichlet is the conjugate prior to the multinomial, if one has a Dirichlet prior with concentration alpha + * and observes each category i n_i times (assuming categories are drawn from a multinomial distribution pi) + * the posterior is alpha_i -> alpha_i + n_i + * + * + * @author David Benjamin <davidben@broadinstitute.org> + */ +public class Dirichlet { + final double[] alpha; + + public Dirichlet(final double... alpha) { + Utils.nonNull(alpha); + Utils.validateArg(alpha.length >= 1, "Dirichlet parameters must have at least one element"); + Utils.validateArg(MathUtils.allMatch(alpha, x -> x >= 0), "Dirichlet parameters may not be negative"); + Utils.validateArg(MathUtils.allMatch(alpha, Double::isFinite), "Dirichlet parameters must be finite"); + this.alpha = alpha.clone(); + } + + /** + * Create a symmetric distribution Dir(a/K, a/K, a/K . . .) where K is the number of states and + * a is the concentration. 
+ */ + public static Dirichlet symmetricDirichlet(final int numStates, final double concentration) { + Utils.validateArg(numStates > 0, "Must have at leat one state"); + Utils.validateArg(concentration > 0, "concentration must be positive"); + return new Dirichlet(Collections.nCopies(numStates, concentration/numStates).stream().mapToDouble(x->x).toArray()); + } + + // in variational Bayes one often needs the effective point estimate of a multinomial distribution with a + // Dirichlet prior. This value is not the mode or mean of the Dirichlet but rather the exp of the expected log weights. + // note that these effective weights do not add up to 1. This is fine because in any probabilistic model scaling all weights + // amounts to an arbitrary normalization constant, but it's important to keep in mind because some classes may expect + // normalized weights. In that case the calling code must normalize the weights. + public double[] effectiveMultinomialWeights() { + final double digammaOfSum = Gamma.digamma(MathUtils.sum(alpha)); + return MathUtils.applyToArray(alpha, a -> Math.exp(Gamma.digamma(a) - digammaOfSum)); + } + + public double[] effectiveLog10MultinomialWeights() { + final double digammaOfSum = Gamma.digamma(MathUtils.sum(alpha)); + return MathUtils.applyToArray(alpha, a -> (Gamma.digamma(a) - digammaOfSum) * MathUtils.LOG10_OF_E); + } + + public double[] meanWeights() { + final double sum = MathUtils.sum(alpha); + return MathUtils.applyToArray(alpha, x -> x / sum); + } + + public double[] log10MeanWeights() { + final double sum = MathUtils.sum(alpha); + return MathUtils.applyToArray(alpha, x -> Math.log10(x / sum)); + } + + public int size() { return alpha.length; } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/IndexRange.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/IndexRange.java new file mode 100644 index 000000000..4cd5a3632 --- /dev/null +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/IndexRange.java @@ -0,0 +1,248 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE"). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2016 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.utils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.function.*; + +/** + * Represents 0-based integer index range. + * + *

    + * <p>
+ * It represents an integer index range as the pair of values:
+ * <dl>
+ *     <dt>{@link #from}</dt>
+ *     <dd>- index of the first element in range (i.e. inclusive).</dd>
+ *     <dt>{@link #to}</dt>
+ *     <dd>- index of the element following the last element in range (i.e. exclusive).</dd>
+ * </dl>
+ * </p>
+ *
+ * <p>
+ * This class is intended to specify a valid index range in arrays or ordered collections.
+ * </p>
+ *
+ * <p>
+ * All instances are constrained so that neither {@code from} nor {@code to} can
+ * be negative, nor can {@code from} be larger than {@code to}.
+ * </p>
+ *
+ * <p>
+ * You can use {@link #isValidLength(int) isValidLength(length)} to verify that a range instance represents a valid
+ * range for a 0-based indexed object with {@code length} elements.
+ * </p>
    + */ +public final class IndexRange { + + /** + * First index in the range. + *

    + * It won't ever be negative nor greater than {@link #to}. + *

    + */ + public final int from; + + /** + * Index following the last index included in the range. + * + *

    + * It won't ever be negative nor less than {@link #from}. + *

    + */ + public final int to; + + /** + * Creates a new range given its {@code from} and {@code to} indices. + * + * @param fromIndex the {@code from} index value. + * @param toIndex the {@code to} index value. + * @throws IllegalArgumentException if {@code fromIndex} is larger than {@code toIndex} or either is + * negative. + */ + public IndexRange(final int fromIndex, final int toIndex) { + Utils.validateArg(fromIndex <= toIndex, "the range size cannot be negative"); + Utils.validateArg(fromIndex >= 0, "the range cannot contain negative indices"); + from = fromIndex; + to = toIndex; + } + + /** + * Checks whether this range is valid for a collection or array of a given size. + * + *

    + * It assume that 0 is the first valid index for target indexed object which is true + * for Java Arrays and mainstream collections. + *

    + * + *

    + * If the input length is less than 0, thus incorrect, this method is guaranteed to return + * {@code false}. No exception is thrown. + *

    + * + * + * @param length the targeted collection or array length. + * @return {@code true} if this range is valid for that {@code length}, {@code false} otherwise. + */ + public boolean isValidLength(final int length) { + return to <= length; + } + + /** + * Returns number indexes expanded by this range. + * + * @return 0 or greater. + */ + public int size() { + return to - from; + } + + /** + * Iterate through all indexes in the range in ascending order to be processed by the + * provided {@link IntConsumer integer consumer} lambda function. + * + *

    + * Exceptions thrown by the execution of the index consumer {@code lambda} + * will be propagated to the caller immediately thus stopping early and preventing + * further indexes to be processed. + *

    + * @param lambda the index consumer lambda. + * @throws IllegalArgumentException if {@code lambda} is {@code null}. + * @throws RuntimeException if thrown by {@code lambda} for some index. + * @throws Error if thrown by {@code lambda} for some index. + */ + public void forEach(final IntConsumer lambda) { + Utils.nonNull(lambda, "the lambda function cannot be null"); + for (int i = from; i < to; i++) { + lambda.accept(i); + } + } + + /** + * Apply an int -> double function to this range, producing a double[] + * + * @param lambda the int -> double function + */ + public double[] mapToDouble(final IntToDoubleFunction lambda) { + Utils.nonNull(lambda, "the lambda function cannot be null"); + final double[] result = new double[size()]; + for (int i = from; i < to; i++) { + result[i - from] = lambda.applyAsDouble(i); + } + return result; + } + + /** + * Sums the values of an int -> double function applied to this range + * + * @param lambda the int -> double function + */ + public double sum(final IntToDoubleFunction lambda) { + Utils.nonNull(lambda, "the lambda function cannot be null"); + double result = 0; + for (int i = from; i < to; i++) { + result += lambda.applyAsDouble(i); + } + return result; + } + + /** + * Apply an int -> int function to this range, producing an int[] + * + * @param lambda the int -> int function + */ + public int[] mapToInteger(final IntUnaryOperator lambda) { + Utils.nonNull(lambda, "the lambda function cannot be null"); + final int[] result = new int[size()]; + for (int i = from; i < to; i++) { + result[i - from] = lambda.applyAsInt(i); + } + return result; + } + + /** + * Find the elements of this range for which an int -> boolean predicate is true + * + * @param predicate the int -> boolean predicate + * @return + */ + public List filter(final IntPredicate predicate) { + Utils.nonNull(predicate, "predicate may not be null"); + final List result = new ArrayList<>(); + forEach(i -> {if (predicate.test(i)) result.add(i); } ); + return 
result; + } + + @Override + public boolean equals(final Object other) { + if (other == this) { + return true; + } else if (!(other instanceof IndexRange)) { + return false; + } else { + final IndexRange otherCasted = (IndexRange) other; + return otherCasted.from == this.from && otherCasted.to == this.to; + } + } + + @Override + public int hashCode() { + // Inspired on {@link Arrays#hashCode(Object[])}. + return (( 31 + Integer.hashCode(from) ) * 31 ) + Integer.hashCode(to); + } + + @Override + public String toString() { + return String.format("%d-%d",from,to); + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeAlleleCountsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeAlleleCountsUnitTest.java index 5f7213ce7..74ed7288e 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeAlleleCountsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeAlleleCountsUnitTest.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.MathUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -76,6 +77,7 @@ public class GenotypeAlleleCountsUnitTest { Assert.assertEquals(subject.distinctAlleleCount(),1); Assert.assertEquals(subject.alleleCountAt(0),ploidy); Assert.assertEquals(subject.alleleCountFor(0),ploidy); + Assert.assertEquals(subject.log10CombinationCount(), 0.0); Assert.assertEquals(subject.alleleRankFor(0),0); Assert.assertEquals(subject.alleleRankFor(1),-2); Assert.assertTrue(subject.containsAllele(0)); @@ -175,6 +177,31 @@ public class GenotypeAlleleCountsUnitTest { while (!current.containsAllele(MAXIMUM_ALLELE_INDEX + 1)) { final 
GenotypeAlleleCounts next = current.next(); + + // test log10CombinationCount + if (ploidy == 2) { + Assert.assertEquals(next.log10CombinationCount(), next.distinctAlleleCount() == 2 ? Math.log10(2) : 0.0); + } else if (ploidy == 3) { + Assert.assertEquals(next.log10CombinationCount(), + next.distinctAlleleCount() == 3 ? Math.log10(6) : (next.distinctAlleleCount() == 2 ? Math.log10(6) - Math.log10(2) : 0.0)); + } else { + if (next.distinctAlleleCount() == 1) { + Assert.assertEquals(next.log10CombinationCount(), 0.0); + } else if (next.distinctAlleleCount() == ploidy) { + Assert.assertEquals(next.log10CombinationCount(), MathUtils.log10Factorial(ploidy)); + } + } + + //test forEach + final List alleleCountsAsList = new ArrayList<>(next.distinctAlleleCount()*2); + next.forEachAlleleIndexAndCount((alleleIndex, alleleCount) -> { + alleleCountsAsList.add(alleleIndex); + alleleCountsAsList.add(alleleCount);}); + final int[] actualAlleleCounts = new int[next.distinctAlleleCount()*2]; + next.copyAlleleCounts(actualAlleleCounts, 0); + + Assert.assertEquals(alleleCountsAsList.stream().mapToInt(n->n).toArray(), actualAlleleCounts); + if (current.distinctAlleleCount() == 1) { Assert.assertEquals(next.maximumAlleleIndex(),current.maximumAlleleIndex() + 1); Assert.assertEquals(next.distinctAlleleCount(), 2 ); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculatorUnitTest.java new file mode 100644 index 000000000..2edfdb81e --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculatorUnitTest.java @@ -0,0 +1,250 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL 
RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE"). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. 
Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2016 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; + +import htsjdk.variant.variantcontext.*; +import org.apache.commons.math3.util.Pair; +import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator; +import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators; +import org.broadinstitute.gatk.utils.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Created by davidben on 7/28/16. + */ +public class AlleleFrequencyCalculatorUnitTest extends BaseTest { + private static final double EPS = 1.0e-8; + private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators(); + + private static final Allele A = Allele.create("A", true); + private static final Allele B = Allele.create("C"); + private static final Allele C = Allele.create("G"); + private static final Allele indel1 = Allele.create("AA"); + + private static final int HAPLOID = 1; + private static final int DIPLOID = 2; + private static final int TRIPLOID = 3; + + private static final int BIALLELIC = 2; + private static final int TRIALLELIC = 3; + + private static final int EXTREMELY_CONFIDENT_PL = 1000; + private static final int FAIRLY_CONFIDENT_PL = 20; + private static final int LOW_CONFIDENCE_PL = 10; + + private static final int DEFAULT_PLOIDY = 2; + + private static int sampleNameCounter = 0; + + @Test + public void testSymmetries() { + final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 0.1, 0.1, DEFAULT_PLOIDY); + final List alleles = Arrays.asList(A,B,C); + final Genotype AA = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,2}, FAIRLY_CONFIDENT_PL); + final Genotype BB = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {1,2}, FAIRLY_CONFIDENT_PL); + final Genotype CC = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {2,2}, 
FAIRLY_CONFIDENT_PL); + final Genotype AB = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,1,1,1}, FAIRLY_CONFIDENT_PL); + final Genotype AC = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,1,2,1}, FAIRLY_CONFIDENT_PL); + + final Genotype BBB = genotypeWithObviousCall(TRIPLOID, TRIALLELIC, new int[] {1,3}, FAIRLY_CONFIDENT_PL); + final Genotype CCC = genotypeWithObviousCall(TRIPLOID, TRIALLELIC, new int[] {2,3}, FAIRLY_CONFIDENT_PL); + + // make pairs of VCs that differ only by B <--> C + final List> switchBWithCPairs = Arrays.asList( + new Pair<>(makeVC(alleles, AA, BB), makeVC(alleles, AA, CC)), + new Pair<>(makeVC(alleles, AA, AB), makeVC(alleles, AA, AC)), + new Pair<>(makeVC(alleles, AB, AB), makeVC(alleles, AC, AC)), + new Pair<>(makeVC(alleles, AA, AA, BB), makeVC(alleles, AA, AA, CC)), + new Pair<>(makeVC(alleles, AA, AB, AB), makeVC(alleles, AA, AC, AC)), + new Pair<>(makeVC(alleles, AA, BBB), makeVC(alleles, AA, CCC)) + ); + for (final Pair pair : switchBWithCPairs) { + final VariantContext vc1 = pair.getFirst(); + final VariantContext vc2 = pair.getSecond(); + final AFCalculationResult result1 = afCalc.getLog10PNonRef(vc1); + final AFCalculationResult result2 = afCalc.getLog10PNonRef(vc2); + Assert.assertEquals(result1.getLog10PosteriorOfAFEq0(), result2.getLog10PosteriorOfAFEq0(), EPS); + Assert.assertEquals(result1.getLog10PosteriorOfAFEq0ForAllele(B), result2.getLog10PosteriorOfAFEq0ForAllele(C), EPS); + Assert.assertEquals(result1.getLog10PosteriorOfAFEq0ForAllele(C), result2.getLog10PosteriorOfAFEq0ForAllele(B), EPS); + } + } + + @Test + public void testMLECounts() { + final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 1, 1, DEFAULT_PLOIDY); + final List alleles = Arrays.asList(A,B,C); + final Genotype AA = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,2}, FAIRLY_CONFIDENT_PL); + final Genotype BB = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {1,2}, FAIRLY_CONFIDENT_PL); + final
Genotype AB = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,1,1,1}, FAIRLY_CONFIDENT_PL); + final Genotype AC = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,1,2,1}, FAIRLY_CONFIDENT_PL); + + final Genotype BBB = genotypeWithObviousCall(TRIPLOID, TRIALLELIC, new int[] {1,3}, FAIRLY_CONFIDENT_PL); + final Genotype CCC = genotypeWithObviousCall(TRIPLOID, TRIALLELIC, new int[] {2,3}, FAIRLY_CONFIDENT_PL); + + final List> vcWithExpectedCounts = Arrays.asList( + new Pair<>(makeVC(alleles, AA, BB), new int[] {2,0}), + new Pair<>(makeVC(alleles, AA, AB), new int[] {1,0}), + new Pair<>(makeVC(alleles, AB, AB), new int[] {2,0}), + new Pair<>(makeVC(alleles, AA, AA, BB), new int[] {2,0}), + new Pair<>(makeVC(alleles, AA, AB, AB), new int[] {2,0}), + new Pair<>(makeVC(alleles, AA, BBB), new int[] {3,0}), + new Pair<>(makeVC(alleles, AA, BBB, CCC), new int[] {3,3}), + new Pair<>(makeVC(alleles, AA, AB, AC), new int[] {1,1}), + new Pair<>(makeVC(alleles, AA, AB, AC, BBB, CCC), new int[] {4,4}) + + ); + for (final Pair pair : vcWithExpectedCounts) { + final VariantContext vc = pair.getFirst(); + final int[] expected = pair.getSecond(); + final int[] actual = afCalc.getLog10PNonRef(vc).getAlleleCountsOfMLE(); + Assert.assertEquals(Arrays.asList(expected), Arrays.asList(actual)); + } + } + + // many samples with low confidence should yield a non-zero MLE, in contrast to the old exact model + @Test + public void testManySamplesWithLowConfidence() { + // prior corresponding to 1000 observations of ref, 1 of a SNP + // for this test, we want many pseudocounts in the prior because the new AF calculator learns the allele frequency + // and we don't want the complication of the posterior being different from the prior + final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1000, 1, 1, DEFAULT_PLOIDY); //prior corresponding to 1000 observations of ref, 1 of a SNP + final List alleles = Arrays.asList(A,B); + + // for FAIRLY_CONFIDENT_PL = 20, this
genotype has about 100 times greater likelihood to be het than hom ref + // with our prior giving 1000 times as much weight to ref, this implies a 1 in 5 chance of each sample having a copy of the alt allele + // (that is, 100/1000 times the combinatorial factor of 2). Thus the MLE for up to 2 samples should be zero + // for five samples we should have one + // for ten samples we will have more than twice as many as for five since the counts from the samples start to influence + // the estimated allele frequency + final Genotype AB = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {0,1,1,1}, FAIRLY_CONFIDENT_PL); + + final List vcsWithDifferentNumbersOfSamples = IntStream.range(1, 11) + .mapToObj(n -> makeVC(alleles, Collections.nCopies(n, AB))).collect(Collectors.toList()); + final int[] counts = vcsWithDifferentNumbersOfSamples.stream().mapToInt(vc -> afCalc.getLog10PNonRef(vc).getAlleleCountAtMLE(B)).toArray(); + Assert.assertEquals(counts[0],0); // one sample + Assert.assertEquals(counts[1],0); // two samples + Assert.assertEquals(counts[4],2); // five samples + Assert.assertTrue(counts[8] >= 3); // ten samples + } + + @Test + public void testApproximateMultiplicativeConfidence() { + final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 1, 1, DEFAULT_PLOIDY); //flat prior -- we will choose genotypes such that the posterior remains flat + final List alleles = Arrays.asList(A,B); + + final Genotype AA = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {0,2}, FAIRLY_CONFIDENT_PL); + final Genotype BB = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {1,2}, FAIRLY_CONFIDENT_PL); + + final List vcsWithDifferentNumbersOfSamples = new ArrayList<>(); + final List genotypeList = new ArrayList<>(); + + for (int n = 0; n < 10; n++) { + genotypeList.add(AA); + genotypeList.add(BB); //adding both keeps the flat prior.
Thus the posterior will equal the likelihood + vcsWithDifferentNumbersOfSamples.add(makeVC(alleles, genotypeList)); + } + + // since we maintain a flat allele frequency distribution, the probability of being ref as each successive sample is added + // is multiplied by the probability of any one. Thus we get an arithmetic series in log space + final double[] log10PRefs = vcsWithDifferentNumbersOfSamples.stream() + .mapToDouble(vc -> afCalc.getLog10PNonRef(vc).getLog10LikelihoodOfAFEq0()).toArray(); + + for (int n = 0; n < 9; n++) { + Assert.assertEquals(log10PRefs[n+1] - log10PRefs[n], log10PRefs[0], 0.01); + } + } + + @Test + public void testManyRefSamplesDontKillGoodVariant() { + final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 0.1, 0.1, DEFAULT_PLOIDY); + final List alleles = Arrays.asList(A,B); + final Genotype AA = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {0,2}, EXTREMELY_CONFIDENT_PL); + final Genotype AB = genotypeWithObviousCall(DIPLOID, BIALLELIC, new int[] {0,1,1,1}, EXTREMELY_CONFIDENT_PL); + for (final int numRef : new int[]{1, 10, 100, 1000, 10000, 100000}) { + final List genotypeList = new ArrayList<>(Collections.nCopies(numRef, AA)); + genotypeList.add(AB); + final VariantContext vc = makeVC(alleles, genotypeList); + final double log10PRef = afCalc.getLog10PNonRef(vc).getLog10LikelihoodOfAFEq0(); + Assert.assertTrue(log10PRef < (-EXTREMELY_CONFIDENT_PL/10) + Math.log10(numRef) + 1); + } + } + + // make PLs that correspond to an obvious call i.e. 
one PL is relatively big and the rest are zero + // alleleCounts is the GenotypeAlleleCounts format for the obvious genotype, with repeats but in no particular order + private static int[] PLsForObviousCall(final int ploidy, final int numAlleles, final int[] alleleCounts, final int PL) { + final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles); + final int[] result = Collections.nCopies(glCalc.genotypeCount(), PL).stream().mapToInt(n->n).toArray(); + result[glCalc.alleleCountsToIndex(alleleCounts)] = 0; + return result; + } + + private static Genotype genotypeWithObviousCall(final int ploidy, final int numAlleles, final int[] alleles, final int PL) { + return makeGenotype(ploidy, PLsForObviousCall(ploidy, numAlleles, alleles, PL)); + } + //note the call is irrelevant to the AFCalculator, which only looks at PLs + private static Genotype makeGenotype(final int ploidy, int ... pls) { + return new GenotypeBuilder("sample" + sampleNameCounter++).alleles(Collections.nCopies(ploidy, Allele.NO_CALL)).PL(pls).make(); + } + + private static VariantContext makeVC(final List alleles, final Genotype... 
genotypes) { + return new VariantContextBuilder().chr("chr1").alleles(alleles).genotypes(genotypes).make(); + } + + private static VariantContext makeVC(final List alleles, final Collection genotypes) { + return new VariantContextBuilder().chr("chr1").alleles(alleles).genotypes(genotypes).make(); + } +} \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java index 1bbb4c0c9..27cb6a6aa 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java @@ -33,6 +33,9 @@ import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import java.math.BigDecimal; import java.util.*; +import java.util.function.DoublePredicate; +import java.util.function.DoubleUnaryOperator; +import java.util.function.IntPredicate; /** * MathUtils is a static class (no instantiation allowed!) with some useful math methods. @@ -57,6 +60,10 @@ public class MathUtils { public static final double LOG_ONE_THIRD = -Math.log10(3.0); private static final double NATURAL_LOG_OF_TEN = Math.log(10.0); private static final double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI); + /** + * Log10 of the e constant. + */ + public static final double LOG10_OF_E = Math.log10(Math.E); /** * A helper class to maintain a cache of log10 values @@ -1686,4 +1693,69 @@ public class MathUtils { public static ExponentialDistribution exponentialDistribution( final double mean ) { return new ExponentialDistributionImpl(mean); } + + /** + * The following method implements Arrays.stream(array).map(func).toArray(), which is concise but performs poorly due + * to the overhead of creating a stream, especially with small arrays. Thus we wrap the wordy but fast array code + * in the following method which permits concise Java 8 code. 
+ * + * Returns a new array -- the original array is not modified. + * + * This method has been benchmarked and performs as well as array-only code. + */ + public static double[] applyToArray(final double[] array, final DoubleUnaryOperator func) { + Utils.nonNull(func, "function may not be null"); + Utils.nonNull(array, "array may not be null"); + final double[] result = new double[array.length]; + for (int m = 0; m < result.length; m++) { + result[m] = func.applyAsDouble(array[m]); + } + return result; + } + + /** + * The following method implements Arrays.stream(array).map(func).toArray(), which is concise but performs poorly due + * to the overhead of creating a stream, especially with small arrays. Thus we wrap the wordy but fast array code + * in the following method which permits concise Java 8 code. + * + * The original array is modified in place. + * + * This method has been benchmarked and performs as well as array-only code. + */ + public static double[] applyToArrayInPlace(final double[] array, final DoubleUnaryOperator func) { + Utils.nonNull(array, "array may not be null"); + Utils.nonNull(func, "function may not be null"); + for (int m = 0; m < array.length; m++) { + array[m] = func.applyAsDouble(array[m]); + } + return array; + } + + /** + * Test whether all elements of a double[] array satisfy a double -> boolean predicate + */ + public static boolean allMatch(final double[] array, final DoublePredicate pred) { + Utils.nonNull(array, "array may not be null"); + Utils.nonNull(pred, "predicate may not be null"); + for (final double x : array) { + if (!pred.test(x)) { + return false; + } + } + return true; + } + + /** + * Test whether all elements of an int[] array satisfy an int -> boolean predicate + */ + public static boolean allMatch(final int[] array, final IntPredicate pred) { + Utils.nonNull(array, "array may not be null"); + Utils.nonNull(pred, "predicate may not be null"); + for (final int x : array) { + if (!pred.test(x)) { + return false; +
} + } + return true; + } } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java index f4bbb48b2..2a8cd3760 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java @@ -35,6 +35,7 @@ import java.net.InetAddress; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.*; +import java.util.function.Supplier; /** * Created by IntelliJ IDEA. @@ -1171,4 +1172,54 @@ public class Utils { return result; } + + /** + * Checks that an Object {@code object} is not null and returns the same object or throws an {@link IllegalArgumentException} + * @param object any Object + * @return the same object + * @throws IllegalArgumentException if a {@code o == null} + */ + public static T nonNull(final T object) { + return Utils.nonNull(object, "Null object is not allowed here."); + } + + /** + * Checks that an {@link Object} is not {@code null} and returns the same object or throws an {@link IllegalArgumentException} + * @param object any Object + * @param message the text message that would be passed to the exception thrown when {@code o == null}. + * @return the same object + * @throws IllegalArgumentException if a {@code o == null} + */ + public static T nonNull(final T object, final String message) { + if (object == null) { + throw new IllegalArgumentException(message); + } + return object; + } + + /** + * Checks that an {@link Object} is not {@code null} and returns the same object or throws an {@link IllegalArgumentException} + * @param object any Object + * @param message the text message that would be passed to the exception thrown when {@code o == null}. 
+ * @return the same object + * @throws IllegalArgumentException if a {@code o == null} + */ + public static T nonNull(final T object, final Supplier message) { + if (object == null) { + throw new IllegalArgumentException(message.get()); + } + return object; + } + + public static void validateArg(final boolean condition, final String msg){ + if (!condition){ + throw new IllegalArgumentException(msg); + } + } + + public static void validateArg(final boolean condition, final Supplier msg){ + if (!condition){ + throw new IllegalArgumentException(msg.get()); + } + } } From 4ef396f72e231024f32e19108005153e2b44e036 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 9 Sep 2016 14:20:18 -0400 Subject: [PATCH 45/68] Assign correct ambiguity code for * allele --- ...astaAlternateReferenceIntegrationTest.java | 30 ++++++++++++ .../fasta/FastaAlternateReferenceMaker.java | 47 ++++++++++++++++--- 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceIntegrationTest.java index fe807874b..929583452 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceIntegrationTest.java @@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.fasta; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.Arrays; @@ -176,4 +177,33 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest { Arrays.asList("8fd887bca9f3949f2c23c3565f7dcc1b")); 
executeTest("test iupac", spec); } + + @Test + void testSpanDel() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T FastaAlternateReferenceMaker -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.delOnly.starFirst.vcf -L 1:1273247 -o %s", + 1, + Arrays.asList("69852222a8c9c9e1604808b62df96f8a")); + executeTest("test spanning deletion", spec); + } + + @DataProvider(name = "iupacSsample") + public Object[][] getIupacSampleData() { + return new Object[][]{ + {"NA1", "b5d95e28263c88b20325d7a545576ad4"}, + {"NA2", "a8b4b79dea8ad1fde2c0d8ff42ca132d"}, + {"NA3", "69852222a8c9c9e1604808b62df96f8a"} + }; + } + + @Test(dataProvider = "iupacSsample") + void testSpanDelIUPAC(final String sample, final String md5) { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T FastaAlternateReferenceMaker -R " + b37KGReference + " --use_IUPAC_sample " + sample + " -V " + privateTestDir + "spanningDel.delOnly.starFirst.vcf -L 1:1273247 -o %s", + 1, + Arrays.asList(md5)); + executeTest("test spanning deletion using IUPAC codes", spec); + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java index 70354081f..e165a87c3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java @@ -42,12 +42,15 @@ import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; +import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; +import 
org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.util.Arrays; import java.util.List; import java.util.Set; +import java.util.Optional; /** @@ -122,6 +125,8 @@ public class FastaAlternateReferenceMaker extends FastaReferenceMaker { private int deletionBasesRemaining = 0; + private static final String EMPTY_BASE = " "; + @Override public void initialize() { super.initialize(); @@ -159,11 +164,16 @@ public class FastaAlternateReferenceMaker extends FastaReferenceMaker { deletionBasesRemaining = vc.getReference().length() - 1; // delete the next n bases, not this one return new Pair<>(context.getLocation(), refBase); - } else if ( vc.isSimpleInsertion()) { - return new Pair<>(context.getLocation(), vc.getAlternateAllele(0).toString()); - } else if (vc.isSNP()) { - final String base = (iupacSample != null) ? getIUPACbase(vc.getGenotype(iupacSample), refBase) : vc.getAlternateAllele(0).toString(); - return new Pair<>(context.getLocation(), base); + } else if ( vc.isSimpleInsertion() || vc.isSNP() ) { + // Get the first alt allele that is not a spanning deletion. If none present, use the empty allele + final Optional optionalAllele = getFirstNonSpanDelAltAllele(vc.getAlternateAlleles()); + final Allele allele = optionalAllele.isPresent() ? optionalAllele.get() : Allele.create(EMPTY_BASE, false); + if ( vc.isSimpleInsertion() ) { + return new Pair<>(context.getLocation(), allele.toString()); + } else { + final String base = (iupacSample != null) ? 
getIUPACbase(vc.getGenotype(iupacSample), refBase) : allele.toString(); + return new Pair<>(context.getLocation(), base); + } } } @@ -177,6 +187,21 @@ public class FastaAlternateReferenceMaker extends FastaReferenceMaker { return new Pair<>(context.getLocation(), refBase); } + /** + * Get the first non spanning deletion (* or <*:DEL>) alt allele + * @param altAlleles the alternate alleles + * @return the first non spanning deletion allele or null + */ + private Optional getFirstNonSpanDelAltAllele( final List altAlleles ) { + for (final Allele allele : altAlleles) { + if (!allele.equals(Allele.SPAN_DEL) && !allele.equals(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED)) { + return Optional.of(allele); + } + } + + return Optional.empty(); + } + /** * Mask a SNP (inserting N's in the sequence) * @@ -199,12 +224,22 @@ public class FastaAlternateReferenceMaker extends FastaReferenceMaker { * * @param genotype the genotype to encode * @param ref the reference base - * @return non-null, non-empty String + * @return non-null, non-empty String of bases */ private String getIUPACbase(final Genotype genotype, final String ref) { if ( genotype == null ) throw new IllegalStateException("The genotype is null for sample " + iupacSample); + // If have a spanning deletion, if both alleles are spanning deletions, use the empty allele. Otherwise, use the allele is not a + // spanning deletion. + if ( genotype.getAlleles().contains(Allele.SPAN_DEL) ) { + if ( genotype.isHomVar() ) { + return EMPTY_BASE; + } else { + return genotype.getAllele(0).equals(Allele.SPAN_DEL) ? genotype.getAllele(1).getBaseString() : genotype.getAllele(0).getBaseString(); + } + } + if ( !genotype.isHet() ) return genotype.isHom() ? 
genotype.getAllele(0).getBaseString() : ref; From 2c83081560901742b0bf4d360facd9b5085e4829 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 23 Sep 2016 17:39:10 -0400 Subject: [PATCH 46/68] Remove sra group --- public/gatk-root/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/public/gatk-root/pom.xml b/public/gatk-root/pom.xml index a6ed90656..1ece58126 100644 --- a/public/gatk-root/pom.xml +++ b/public/gatk-root/pom.xml @@ -418,7 +418,6 @@ ${gatk.queuetests.run} ${java.io.tmpdir} - sra From 81a9ece45b0fefc2a83a6e9f08d09cd7d4d0de10 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Sun, 2 Oct 2016 12:13:18 -0400 Subject: [PATCH 47/68] deprecate -stand_emit_conf --- .../GenotypeCalculationArgumentCollection.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index 9e14efaac..f3915f555 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -54,6 +54,7 @@ package org.broadinstitute.gatk.engine.arguments; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Hidden; import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; import java.util.Collections; @@ -123,6 +124,15 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled 
confidence threshold at which variants should be called", required = false) public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + /** + * This argument allows you to emit low quality calls as filtered records. + */ + @Hidden + @Deprecated + @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", + doc = "This argument is no longer used in GATK versions 3.7 and newer. Please see the online documentation for the latest usage recommendations.", required = false) + public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + /** * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES), * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it From 16417bbf34e918d51780be490fe43b52a7fb5fce Mon Sep 17 00:00:00 2001 From: Valentin Ruano Rubio Date: Tue, 11 Oct 2016 11:54:17 -0400 Subject: [PATCH 48/68] Fixes NaN issue in new Qual calculator Fixes issue #1491 --- .../genotyper/afcalc/AlleleFrequencyCalculator.java | 9 ++++++++- .../variantutils/GenotypeGVCFsIntegrationTest.java | 9 +++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java index febdebb9c..abfddab82 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java @@ -159,8 +159,15 @@ public final class AlleleFrequencyCalculator extends AFCalculator { pOfNonZeroAltAlleles[alleleIndex] += genotypePosterior); } + // Make 
sure that we handle appropriately pOfNonZeroAltAlleles that are close to 1; values just over 1.0 due to + // rounding error would result in NaN. + // As every allele is present in at least one genotype, the p-non-zero-count for + // any allele is bound above by 1.0 - minimum genotype posterior because at least one genotype + // does not contain this allele. + final double maximumPNonZeroCount = 1.0 - MathUtils.arrayMin(genotypePosteriors); + for (int allele = 0; allele < numAlleles; allele++) { - log10POfZeroCountsByAllele[allele] += Math.log10(1 - pOfNonZeroAltAlleles[allele]); + log10POfZeroCountsByAllele[allele] += Math.log10(1.0 - Math.min(maximumPNonZeroCount, pOfNonZeroAltAlleles[allele])); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 98b806f40..47e813204 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -681,4 +681,13 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { spec.disableShadowBCF(); executeTest("testGenotypingSpanningDeletionAcrossLines", spec); } + + @Test + public void testNewQualNaNBugFix() { + final WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -newQual -V " + privateTestDir + "input-newqual-nan-bug-fix.vcf", b37KGReferenceWithDecoy), + Collections.singletonList("503f4193c22fbcc451bd1c425b8b6bf8")); + spec.disableShadowBCF(); + executeTest("testNewQualNaNBugFix", spec); + } } \ No newline at end of file From a301ba1977817cab5b6cb1cc10bd306ee394a8ac Mon Sep 17 00:00:00 2001 From: Pierre Lindenbaum Date: Fri, 23 Sep 2016 11:17:24 +0200 Subject: [PATCH 
49/68] Add read group identifier to column names If two SAMReadGroupRecords share the same sample name, the output will contain duplicate column headers. This PR fix the problem. --- .../diagnostics/ReadLengthDistribution.java | 2 +- ...ReadLengthDistributionIntegrationTest.java | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistributionIntegrationTest.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java index 098d4d4ba..1e7e48c5a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java @@ -163,7 +163,7 @@ public class ReadLengthDistribution extends ReadWalker { } else{ for (SAMReadGroupRecord rg : readGroups) - tableReport.addColumn(rg.getSample()); + tableReport.addColumn(rg.getSample() + "[rg:" + rg.getReadGroupId() +"]"); int rowIndex = 0; for (Integer length : table.keySet()){ tableReport.set(rowIndex,0,length); diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistributionIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistributionIntegrationTest.java new file mode 100644 index 000000000..e25b1be8c --- /dev/null +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistributionIntegrationTest.java @@ -0,0 +1,42 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. 
+* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.tools.walkers.diagnostics; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class ReadLengthDistributionIntegrationTest extends WalkerTest { + + @Test + public void testReadLengthDistributionMultiSample() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T ReadLengthDistribution -R " + b37KGReference + " -I " + privateTestDir + "testReadPos.snippet.bam -rf NotPrimaryAlignment -rf UnmappedRead -rf FailsVendorQualityCheck -rf DuplicateRead -o %s", 1, + Arrays.asList("9f0551e7116c7c8a56d6f7f756d02795")); + executeTest("test ReadLengthDistribution with a bame with multiple read groups", spec); + } +} From cfd3ffa2c00c3edd177895a302fd2d2b38d7350c Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 20 Sep 2016 12:49:06 -0400 Subject: [PATCH 50/68] Replace SAMFileReader with calls to SamReaderFactory --- .../bwa/java/AlignerTestHarness.java | 10 +- .../datasources/reads/BlockInputStream.java | 2 +- .../reads/utilities/BAMFileStat.java | 18 +- .../reads/utilities/BAMTagRenamer.java | 15 +- .../reads/utilities/PrintBAMRegion.java | 12 +- .../reads/utilities/PrintBGZFBounds.java | 2 - .../gatk/engine/io/OutputTracker.java | 2 +- .../io/storage/SAMFileWriterStorage.java | 9 +- .../SAMReaderArgumentTypeDescriptor.java | 8 +- .../gatk/engine/CommandLineGATKUnitTest.java | 15 +- .../engine/EngineFeaturesIntegrationTest.java | 43 ++-- .../reads/DownsamplerBenchmark.java | 4 +- .../GATKBAMIndexFromDataSourceUnitTest.java | 4 +- .../reads/GATKBAMIndexFromFileUnitTest.java | 9 +- .../reads/PicardBaselineBenchmark.java | 18 +- .../reads/ReadProcessingBenchmark.java | 15 +- .../reads/TheoreticalMinimaBenchmark.java | 12 +- .../queue/extensions/gatk/ArgumentField.java | 4 +- .../gatk/queue/util/QScriptUtils.scala | 11 +- .../gatk/queue/util/VCF_BAM_utilities.scala | 4 +- .../utils/diffengine/BAMDiffableReader.java | 13 +- 
.../utils/locusiterator/LIBSPerformance.java | 7 +- .../locusiterator/LocusIteratorByState.java | 6 +- .../utils/sam/ArtificialSAMFileReader.java | 198 ++++++++++++++++-- .../gatk/utils/ExampleToCopyUnitTest.java | 12 +- .../sam/ArtificialBAMBuilderUnitTest.java | 12 +- 26 files changed, 339 insertions(+), 126 deletions(-) diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java index db098e0e3..ff9f2dcc4 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java @@ -35,6 +35,7 @@ import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; /** * A test harness to ensure that the perfect aligner works. 
@@ -63,8 +64,7 @@ public class AlignerTestHarness { Aligner aligner = new BWAJavaAligner(bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile); int count = 0; - SAMFileReader reader = new SAMFileReader(bamFile); - reader.setValidationStringency(ValidationStringency.SILENT); + final SamReader reader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(bamFile); int mismatches = 0; int failures = 0; @@ -160,6 +160,12 @@ public class AlignerTestHarness { System.out.printf("%d reads examined.%n",count); } + try { + reader.close(); + } catch ( IOException ex ) { + throw new ReviewedGATKException("Unable to close " + bamFile , ex); + } + System.out.printf("%d reads examined; %d mismatches; %d failures.%n",count,mismatches,failures); } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java index aa9462914..1ea1f4276 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java @@ -40,7 +40,7 @@ import java.util.LinkedList; import java.util.List; /** - * Presents decompressed blocks to the SAMFileReader. + * Presents decompressed blocks to the SamReader. 
*/ public class BlockInputStream extends InputStream { /** diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java index 32bb8368a..da31c49b3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java @@ -25,15 +25,14 @@ package org.broadinstitute.gatk.engine.datasources.reads.utilities; -import htsjdk.samtools.BAMIndex; -import htsjdk.samtools.SAMFileReader; -import htsjdk.samtools.ValidationStringency; +import htsjdk.samtools.*; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.instrumentation.Sizeof; import java.io.File; +import java.io.IOException; import java.lang.reflect.Field; import java.util.List; import java.util.Map; @@ -56,7 +55,7 @@ public class BAMFileStat extends CommandLineProgram { @Argument(doc="The range to inspect.",required=false) private String range; - public int execute() { + public int execute() throws IOException { switch(command) { case ShowBlocks: throw new ReviewedGATKException("The BAM block inspector has been disabled."); @@ -81,14 +80,11 @@ public class BAMFileStat extends CommandLineProgram { } } - private void showIndexBins(File bamFile,String contigName) { - SAMFileReader reader; - BAMIndex index; + private void showIndexBins(File bamFile,String contigName) throws IOException { - reader = new SAMFileReader(bamFile); - reader.setValidationStringency(ValidationStringency.SILENT); - reader.enableIndexCaching(true); - index = reader.getIndex(); + final SamReader reader = 
SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES). + validationStringency(ValidationStringency.SILENT).open(bamFile); + final SamReader.Indexing index = reader.indexing(); reader.queryOverlapping(contigName,1,reader.getFileHeader().getSequence(contigName).getSequenceLength()).close(); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java index 385292d8b..17408c017 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java @@ -25,14 +25,17 @@ package org.broadinstitute.gatk.engine.datasources.reads.utilities; -import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMFileWriterFactory; import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import java.io.File; +import java.io.IOException; /** * A simple utility written directly in Picard that will rename tags @@ -62,7 +65,7 @@ public class BAMTagRenamer extends CommandLineProgram { long readsWritten = 0; long readsAltered = 0; - SAMFileReader reader = new SAMFileReader(input); + final SamReader reader = SamReaderFactory.makeDefault().open(input); SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(reader.getFileHeader(),true,output,compressionLevel); for(SAMRecord read: reader) { @@ -79,7 +82,13 @@ public class BAMTagRenamer extends CommandLineProgram { } writer.close(); - System.out.printf("%d reads written. 
%d tag names updated from %s to %s.%n",readsWritten,readsAltered,sourceTagName,targetTagName); + System.out.printf("%d reads written. %d tag names updated from %s to %s.%n",readsWritten,readsAltered,sourceTagName,targetTagName); + + try { + reader.close(); + } catch ( IOException ex ) { + throw new ReviewedGATKException("Unable to close " + input , ex); + } return 0; } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java index 26f0e4d29..d2cbc588b 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java @@ -31,6 +31,7 @@ import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; import org.broadinstitute.gatk.utils.exceptions.UserException; import java.io.File; +import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -53,9 +54,8 @@ public class PrintBAMRegion extends CommandLineProgram { private static final int MIN_OFFSET_SIZE = 0; private static final int MAX_OFFSET_SIZE = (int)Math.pow(2,16)-1; - public int execute() { - SAMFileReader reader = new SAMFileReader(input); - reader.setValidationStringency(ValidationStringency.SILENT); + public int execute() throws IOException { + final SamReader reader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(input); Pattern regionPattern = Pattern.compile("(\\d+):(\\d+)-(\\d+):(\\d+)"); Matcher matcher = regionPattern.matcher(region); @@ -76,10 +76,10 @@ public class PrintBAMRegion extends CommandLineProgram { if(lastOffset < MIN_OFFSET_SIZE || lastOffset > MAX_OFFSET_SIZE) throw new UserException(String.format("Last offset is invalid; must be between %d and %d; actually 
is %d",MIN_OFFSET_SIZE,MAX_OFFSET_SIZE,lastOffset)); - GATKChunk chunk = new GATKChunk(firstBlock<<16 | firstOffset,lastBlock<<16 | lastOffset); - GATKBAMFileSpan fileSpan = new GATKBAMFileSpan(chunk); + final GATKChunk chunk = new GATKChunk(firstBlock<<16 | firstOffset,lastBlock<<16 | lastOffset); + final GATKBAMFileSpan fileSpan = new GATKBAMFileSpan(chunk); - SAMRecordIterator iterator = reader.iterator(fileSpan); + final SAMRecordIterator iterator = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(fileSpan); long readCount = 0; while(iterator.hasNext()) { System.out.printf("%s%n",iterator.next().format()); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java index 5ec07390f..8c7b62fca 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java @@ -65,8 +65,6 @@ public class PrintBGZFBounds extends CommandLineProgram { float uncompressedSize = 0; long totalBlocks = 0; - //SAMFileReader reader = new SAMFileReader(input); - while(true) { final long blockStart = fis.getChannel().position(); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java index 87f601923..1db822192 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java @@ -98,7 +98,7 @@ public abstract class OutputTracker implements ReferenceBacked { ArgumentSource targetField = io.getKey(); Object targetValue = io.getValue(); - // Ghastly hack: reaches in and 
finishes building out the SAMFileReader. + // Ghastly hack: reaches in and finishes building out the SameReader. // TODO: Generalize this, and move it to its own initialization step. if( targetValue instanceof SAMReaderBuilder) { SAMReaderBuilder builder = (SAMReaderBuilder)targetValue; diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java index b1aecb22c..acce6addb 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java @@ -31,6 +31,7 @@ import htsjdk.samtools.util.ProgressLoggerInterface; import htsjdk.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; +import org.broadinstitute.gatk.utils.exceptions.GATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.sam.SimplifyingSAMFileWriter; @@ -112,7 +113,7 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage iterator = reader.iterator(); while( iterator.hasNext() ) @@ -120,7 +121,11 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage readGroupToNewSampleMap = new HashMap<>(); for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); - final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + final SamReader reader = SamReaderFactory.makeDefault().open(inputBam); final String newSampleName = String.format("newSampleFor%s", inputBamID); - for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + for ( final SAMReadGroupRecord readGroup : 
reader.getFileHeader().getReadGroups() ) { readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); } - inputBamReader.close(); + reader.close(); } final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + @@ -420,10 +419,10 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { 1, Arrays.asList("")); // No MD5s; we only want to check the read groups final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFiles", spec).first.get(0); - final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + final SamReader reader = SamReaderFactory.makeDefault().open(outputBam); int totalReadGroupsSeen = 0; - for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) { Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); totalReadGroupsSeen++; @@ -431,7 +430,7 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); - outputBamReader.close(); + reader.close(); } // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam, @@ -446,15 +445,15 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { final Map readGroupToNewSampleMap = new HashMap<>(); for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); - final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + final SamReader reader = SamReaderFactory.makeDefault().open(inputBam); // Special-case NA12891, which we're not renaming: final String newSampleName = 
inputBamID.equals("12891") ? "NA12891" : String.format("newSampleFor%s", inputBamID); - for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) { readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); } - inputBamReader.close(); + reader.close(); } final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + @@ -467,10 +466,10 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { 1, Arrays.asList("")); // No MD5s; we only want to check the read groups final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename", spec).first.get(0); - final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + final SamReader reader = SamReaderFactory.makeDefault().open(outputBam); int totalReadGroupsSeen = 0; - for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) { Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); totalReadGroupsSeen++; @@ -478,7 +477,7 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); - outputBamReader.close(); + reader.close(); } // On-the-fly sample renaming test case: two single-sample bams with read group collisions @@ -489,11 +488,11 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam newSampleForNot12878")); final Set na12878ReadGroups = new HashSet<>(); - final SAMFileReader inputBamReader = new SAMFileReader(new File(privateTestDir + 
"CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam")); - for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + final SamReader inpuBAMreader = SamReaderFactory.makeDefault().open(new File(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam")); + for ( final SAMReadGroupRecord readGroup : inpuBAMreader.getFileHeader().getReadGroups() ) { na12878ReadGroups.add(readGroup.getId()); } - inputBamReader.close(); + inpuBAMreader.close(); final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + " -R " + b37KGReference + @@ -504,10 +503,10 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { 1, Arrays.asList("")); // No MD5s; we only want to check the read groups final File outputBam = executeTest("testOnTheFlySampleRenamingWithReadGroupCollisions", spec).first.get(0); - final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + final SamReader outputBAMreader = SamReaderFactory.makeDefault().open(outputBam); int totalReadGroupsSeen = 0; - for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + for ( final SAMReadGroupRecord readGroup : outputBAMreader.getFileHeader().getReadGroups() ) { String expectedSampleName = ""; if ( na12878ReadGroups.contains(readGroup.getId()) ) { expectedSampleName = "newSampleFor12878"; @@ -523,7 +522,7 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { Assert.assertEquals(totalReadGroupsSeen, na12878ReadGroups.size() * 2, "Wrong number of read groups encountered in output bam file"); - outputBamReader.close(); + outputBAMreader.close(); } // On-the-fly sample renaming test case: a multi-sample bam (this should generate a UserException) diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java index b45db15b0..47d10d234 100644 --- 
a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java @@ -55,12 +55,12 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { // public void timeDownsampling(int reps) { // for(int i = 0; i < reps; i++) { -// SAMFileReader reader = new SAMFileReader(inputFile); +// SamReader reader = SamReaderFactory.makeDefault().open(inputFile); // ReadProperties readProperties = new ReadProperties(Collections.singletonList(new SAMReaderID(inputFile,new Tags())), // reader.getFileHeader(), // SAMFileHeader.SortOrder.coordinate, // false, -// SAMFileReader.ValidationStringency.SILENT, +// ValidationStringency.SILENT, // downsampling.create(), // new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), // Collections.emptyList(), diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java index 3cd4ac6e4..5d051e37f 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromDataSourceUnitTest.java @@ -25,7 +25,6 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import org.broadinstitute.gatk.utils.BaseTest; @@ -50,8 +49,7 @@ public class GATKBAMIndexFromDataSourceUnitTest extends BaseTest { @BeforeClass public void init() throws IOException { - final SAMFileReader reader = new SAMFileReader(bamFile); - reader.enableIndexCaching(true); // needed ot get BrowseableBAMIndex + final 
SamReader reader = SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES).open(bamFile); Assert.assertTrue(reader.hasIndex()); Assert.assertTrue(reader.indexing().hasBrowseableIndex()); diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java index c13a0006d..bfa53ff74 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexFromFileUnitTest.java @@ -25,8 +25,9 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.testng.Assert; @@ -34,7 +35,7 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; -import java.io.FileNotFoundException; +import java.io.IOException; /** * Test basic functionality in the GATK's implementation of the BAM index classes. 
@@ -59,8 +60,8 @@ public class GATKBAMIndexFromFileUnitTest extends BaseTest { @BeforeClass - public void init() throws FileNotFoundException { - final SAMFileReader reader = new SAMFileReader(bamFile); + public void init() throws IOException { + final SamReader reader = SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES).open(bamFile); sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); reader.close(); diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java index a05a852d2..0a118408c 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java @@ -26,13 +26,13 @@ package org.broadinstitute.gatk.engine.datasources.reads; import com.google.caliper.Param; -import com.google.caliper.SimpleBenchmark; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.util.SamLocusIterator; -import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.CloseableIterator; -import java.io.File; +import java.io.IOException; import java.util.Iterator; /** @@ -55,9 +55,9 @@ public class PicardBaselineBenchmark extends ReadProcessingBenchmark { @Override public Integer getMaxReads() { return maxReads; } - public void timeDecompressBamFile(int reps) { + public void timeDecompressBamFile(int reps) throws IOException { for(int i = 0; i < reps; i++) { - SAMFileReader reader = new SAMFileReader(inputFile); + final SamReader reader = SamReaderFactory.makeDefault().open(inputFile); CloseableIterator iterator = reader.iterator(); while(iterator.hasNext()) iterator.next(); @@ -66,9 +66,9 @@ public class 
PicardBaselineBenchmark extends ReadProcessingBenchmark { } } - public void timeExtractTag(int reps) { + public void timeExtractTag(int reps) throws IOException { for(int i = 0; i < reps; i++) { - SAMFileReader reader = new SAMFileReader(inputFile); + final SamReader reader = SamReaderFactory.makeDefault().open(inputFile); CloseableIterator iterator = reader.iterator(); while(iterator.hasNext()) { SAMRecord read = iterator.next(); @@ -79,9 +79,9 @@ public class PicardBaselineBenchmark extends ReadProcessingBenchmark { } } - public void timeSamLocusIterator(int reps) { + public void timeSamLocusIterator(int reps) throws IOException { for(int i = 0; i < reps; i++) { - SAMFileReader reader = new SAMFileReader(inputFile); + final SamReader reader = SamReaderFactory.makeDefault().open(inputFile); long loci = 0; SamLocusIterator samLocusIterator = new SamLocusIterator(reader); diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java index d176249d5..a7bf2ebc1 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java @@ -25,11 +25,11 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import com.google.caliper.Param; import com.google.caliper.SimpleBenchmark; -import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMFileWriterFactory; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; @@ -51,7 +51,7 @@ public abstract class ReadProcessingBenchmark extends SimpleBenchmark { @Override public void setUp() { - SAMFileReader fullInputFile 
= new SAMFileReader(new File(getBAMFile())); + SamReader reader = SamReaderFactory.makeDefault().open(new File(getBAMFile())); File tempFile = null; try { @@ -62,15 +62,20 @@ public abstract class ReadProcessingBenchmark extends SimpleBenchmark { } SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(true); - SAMFileWriter writer = factory.makeBAMWriter(fullInputFile.getFileHeader(),true,tempFile); + SAMFileWriter writer = factory.makeBAMWriter(reader.getFileHeader(),true,tempFile); long numReads = 0; - for(SAMRecord read: fullInputFile) { + for(SAMRecord read: reader) { if(numReads++ >= getMaxReads()) break; writer.addAlignment(read); } + try { + reader.close(); + } catch ( IOException ex ) { + throw new ReviewedGATKException("Unable to close " + getBAMFile() , ex); + } writer.close(); inputFile = tempFile; diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java index 01ec4238f..754b288a6 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java @@ -28,11 +28,13 @@ package org.broadinstitute.gatk.engine.datasources.reads; import com.google.caliper.Param; import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; -import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.CloseableIterator; import java.io.File; +import java.io.IOException; /** * Created by IntelliJ IDEA. 
@@ -54,10 +56,10 @@ public class TheoreticalMinimaBenchmark extends ReadProcessingBenchmark { @Override public Integer getMaxReads() { return maxReads; } - public void timeIterateOverEachBase(int reps) { + public void timeIterateOverEachBase(int reps) throws IOException { System.out.printf("Processing " + inputFile); for(int i = 0; i < reps; i++) { - SAMFileReader reader = new SAMFileReader(inputFile); + final SamReader reader = SamReaderFactory.makeDefault().open((inputFile)); CloseableIterator iterator = reader.iterator(); long As=0,Cs=0,Gs=0,Ts=0; @@ -78,14 +80,14 @@ public class TheoreticalMinimaBenchmark extends ReadProcessingBenchmark { } } - public void timeIterateOverCigarString(int reps) { + public void timeIterateOverCigarString(int reps) throws IOException { for(int i = 0; i < reps; i++) { long matchMismatches = 0; long insertions = 0; long deletions = 0; long others = 0; - SAMFileReader reader = new SAMFileReader(inputFile); + final SamReader reader = SamReaderFactory.makeDefault().open(inputFile); CloseableIterator iterator = reader.iterator(); while(iterator.hasNext()) { SAMRecord read = iterator.next(); diff --git a/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentField.java b/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentField.java index 9012e1d56..71aa55bdb 100644 --- a/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentField.java +++ b/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentField.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.queue.extensions.gatk; -import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SamReader; import htsjdk.samtools.SAMFileWriter; import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.StringUtils; @@ -245,7 +245,7 @@ public abstract class 
ArgumentField { protected static Class mapType(Class clazz) { if (InputStream.class.isAssignableFrom(clazz)) return File.class; - if (SAMFileReader.class.isAssignableFrom(clazz)) return File.class; + if (SamReader.class.isAssignableFrom(clazz)) return File.class; if (OutputStream.class.isAssignableFrom(clazz)) return File.class; if (VariantContextWriter.class.isAssignableFrom(clazz)) return File.class; if (SAMFileWriter.class.isAssignableFrom(clazz)) return File.class; diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QScriptUtils.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QScriptUtils.scala index b8923f212..b995c40f0 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QScriptUtils.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QScriptUtils.scala @@ -27,7 +27,7 @@ package org.broadinstitute.gatk.queue.util import java.io.File import io.Source._ -import htsjdk.samtools.{SAMReadGroupRecord, SAMFileReader} +import htsjdk.samtools.{SamReaderFactory, SAMReadGroupRecord} import collection.JavaConversions._ @@ -87,8 +87,10 @@ object QScriptUtils { * Returns the number of contigs in the BAM file header. 
*/ def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(bamFile) - samReader.getFileHeader.getSequenceDictionary.getSequences.size() + val samReader = SamReaderFactory.makeDefault().open(bamFile) + val size = samReader.getFileHeader.getSequenceDictionary.getSequences.size() + samReader.close + return size } /** @@ -112,11 +114,12 @@ object QScriptUtils { * @return a set with all distinct samples (in no particular order) */ def getSamplesFromBAM(bam: File) : Set[String] = { - val reader = new SAMFileReader(bam) + val reader = SamReaderFactory.makeDefault().open(bam); var samples: Set[String] = Set() for (rg <- reader.getFileHeader.getReadGroups) { samples += rg.getSample } + reader.close samples } } diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/VCF_BAM_utilities.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/VCF_BAM_utilities.scala index 099ab79e8..55d932056 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/VCF_BAM_utilities.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/VCF_BAM_utilities.scala @@ -28,7 +28,7 @@ package org.broadinstitute.gatk.queue.util import java.io.File import org.apache.commons.io.FilenameUtils import scala.io.Source._ -import htsjdk.samtools.SAMFileReader +import htsjdk.samtools.{SamReaderFactory} import htsjdk.variant.vcf.{VCFHeader, VCFCodec} import scala.collection.JavaConversions._ import htsjdk.tribble.AbstractFeatureReader @@ -40,7 +40,7 @@ object VCF_BAM_utilities { } def getSamplesInBAM(bam: File): List[String] = { - return new SAMFileReader(bam).getFileHeader().getReadGroups().toList.map(srgr => srgr.getSample()).toSet.toList + return SamReaderFactory.makeDefault().open(bam).getFileHeader().getReadGroups().toList.map(srgr => srgr.getSample()).toSet.toList } def parseBAMsInput(bamsIn: File): List[File] = FilenameUtils.getExtension(bamsIn.getPath) match { diff --git 
a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/BAMDiffableReader.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/BAMDiffableReader.java index c423d78fe..d8250b51f 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/BAMDiffableReader.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/BAMDiffableReader.java @@ -25,11 +25,13 @@ package org.broadinstitute.gatk.utils.diffengine; -import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.util.BlockCompressedInputStream; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import java.io.*; import java.util.Arrays; @@ -49,8 +51,7 @@ public class BAMDiffableReader implements DiffableReader { @Override public DiffElement readFromFile(File file, int maxElementsToRead) { - final SAMFileReader reader = new SAMFileReader(file, null); // null because we don't want it to look for the index - reader.setValidationStringency(ValidationStringency.SILENT); + final SamReader reader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(file); DiffNode root = DiffNode.rooted(file.getName()); SAMRecordIterator iterator = reader.iterator(); @@ -93,7 +94,11 @@ public class BAMDiffableReader implements DiffableReader { break; } - reader.close(); + try { + reader.close(); + } catch (final IOException ex ) { + throw new ReviewedGATKException("Unable to close " + file , ex); + } return root.getBinding(); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java index 5bb518a50..c165b3120 100644 --- 
a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java @@ -25,7 +25,8 @@ package org.broadinstitute.gatk.utils.locusiterator; -import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.reference.ReferenceSequenceFile; @@ -65,7 +66,7 @@ public class LIBSPerformance extends CommandLineProgram { final ReferenceSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile); final GenomeLocParser genomeLocParser = new GenomeLocParser(reference); - final SAMFileReader reader = new SAMFileReader(samFile); + final SamReader reader = SamReaderFactory.makeDefault().open(samFile); SAMRecordIterator rawIterator; if ( location == null ) @@ -81,6 +82,8 @@ public class LIBSPerformance extends CommandLineProgram { for ( final SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups() ) samples.add(rg.getSample()); + reader.close(); + final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(downsample, 250); final LocusIteratorByState libs = diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java index 3ce8783b5..fe509948f 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java @@ -28,7 +28,7 @@ package org.broadinstitute.gatk.utils.locusiterator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import htsjdk.samtools.CigarOperator; -import htsjdk.samtools.SAMFileReader; +import 
htsjdk.samtools.SamReader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.CloseableIterator; import org.apache.log4j.Logger; @@ -149,7 +149,7 @@ public final class LocusIteratorByState extends LocusIterator { } /** - * Create a new LocusIteratorByState based on a SAMFileReader using reads in an iterator it + * Create a new LocusIteratorByState based on a SamReader using reads in an iterator it * * Simple constructor that uses the samples in the reader, doesn't do any downsampling, * and makes a new GenomeLocParser using the reader. This constructor will be slow(ish) @@ -158,7 +158,7 @@ public final class LocusIteratorByState extends LocusIterator { * @param reader a non-null reader * @param it an iterator from reader that has the reads we want to use to create ReadBackPileups */ - public LocusIteratorByState(final SAMFileReader reader, final CloseableIterator it) { + public LocusIteratorByState(final SamReader reader, final CloseableIterator it) { this(new GATKSAMRecordIterator(it), new LIBSDownsamplingInfo(false, 0), true, diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileReader.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileReader.java index 9491ed254..846cc7f3d 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileReader.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileReader.java @@ -26,16 +26,19 @@ package org.broadinstitute.gatk.utils.sam; import htsjdk.samtools.*; +import htsjdk.samtools.SamReader.Indexing; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import java.io.ByteArrayInputStream; +import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; 
+ /** * User: hanna * Date: Jun 11, 2009 @@ -53,7 +56,13 @@ import java.util.List; * Pass specified reads into the given walker. */ -public class ArtificialSAMFileReader extends SAMFileReader { +public class ArtificialSAMFileReader implements SamReader, Indexing { + + /** + * The reader of SamRecords + */ + private SamReader reader; + /** * The parser, for GenomeLocs. */ @@ -64,15 +73,20 @@ public class ArtificialSAMFileReader extends SAMFileReader { */ private final List reads; + /** + * Input/custom SAM file header + */ private SAMFileHeader customHeader = null; /** * Construct an artificial SAM file reader. + * * @param sequenceDictionary sequence dictionary used to initialize our GenomeLocParser * @param reads Reads to use as backing data source. */ public ArtificialSAMFileReader(SAMSequenceDictionary sequenceDictionary,SAMRecord... reads) { - super( createEmptyInputStream(),true ); + final SamInputResource samInputResource = SamInputResource.of(createEmptyInputStream()); + reader = SamReaderFactory.makeDefault().open(samInputResource); this.genomeLocParser = new GenomeLocParser(sequenceDictionary); this.reads = Arrays.asList(reads); } @@ -84,30 +98,75 @@ public class ArtificialSAMFileReader extends SAMFileReader { * @param reads Reads to use as backing data source. */ public ArtificialSAMFileReader( SAMFileHeader customHeader, SAMRecord... 
reads ) { - super(createEmptyInputStream(),true); + final SamInputResource samInputResource = SamInputResource.of(createEmptyInputStream()); + reader = SamReaderFactory.makeDefault().open(samInputResource); this.customHeader = customHeader; this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary()); this.reads = Arrays.asList(reads); } + @Override + public String getResourceDescription() { + return this.toString(); + } @Override - public SAMFileHeader getFileHeader() { - if ( customHeader != null ) { - return customHeader; - } + public boolean hasIndex() { + return this.reader.hasIndex(); + } - return super.getFileHeader(); + @Override + public Indexing indexing() { + return this; + } + + @Override + public BrowseableBAMIndex getBrowseableIndex() { + BAMIndex index = this.getIndex(); + if(!(index instanceof BrowseableBAMIndex)) { + throw new SAMException("Cannot return index: index created by BAM is not browseable."); + } else { + return BrowseableBAMIndex.class.cast(index); + } + } + + @Override + public boolean hasBrowseableIndex() { + return this.hasIndex() && this.getIndex() instanceof BrowseableBAMIndex; + } + + @Override + public BAMIndex getIndex() { + throw new UnsupportedOperationException(); + } + + @Override + public SAMRecordIterator iterator() { + return new SAMRecordIterator() { + private final Iterator iterator = reads.iterator(); + public boolean hasNext() { return iterator.hasNext(); } + public SAMRecord next() { return iterator.next(); } + public void close() {} + public void remove() { iterator.remove(); } + public SAMRecordIterator assertSorted(SAMFileHeader.SortOrder sortOrder) { return this; } + }; } /** - * @{inheritDoc} + * Iterate through the the file. + * + * @param chunks List of chunks for which to retrieve data. + * @return An iterator. 
*/ @Override + public SAMRecordIterator iterator(SAMFileSpan chunks) { + return new SamReader.AssertingIterator(this.reader.iterator()); + } + public SAMRecordIterator query(final String sequence, final int start, final int end, final boolean contained) { GenomeLoc region = genomeLocParser.createGenomeLoc(sequence, start, end); - List coveredSubset = new ArrayList(); + List coveredSubset = new ArrayList<>(); for( SAMRecord read: reads ) { GenomeLoc readPosition = genomeLocParser.createGenomeLoc(read); @@ -126,15 +185,116 @@ public class ArtificialSAMFileReader extends SAMFileReader { } @Override - public SAMRecordIterator iterator() { - return new SAMRecordIterator() { - private final Iterator iterator = reads.iterator(); - public boolean hasNext() { return iterator.hasNext(); } - public SAMRecord next() { return iterator.next(); } - public void close() {} - public void remove() { iterator.remove(); } - public SAMRecordIterator assertSorted(SAMFileHeader.SortOrder sortOrder) { return this; } - }; + public SAMRecordIterator queryOverlapping(final String sequence, final int start, final int end) { + return this.query(sequence, start, end, false); + } + + @Override + public SAMRecordIterator queryContained(final String sequence, final int start, final int end) { + return this.query(sequence, start, end, true); + } + + @Override + public SAMRecordIterator query(final QueryInterval[] intervals, final boolean contained) { + return new AssertingIterator(this.reader.query(intervals, contained)); + } + + @Override + public SAMRecordIterator queryOverlapping(final QueryInterval[] intervals) { + return this.query(intervals, false); + } + + @Override + public SAMRecordIterator queryContained(final QueryInterval[] intervals) { + return this.query(intervals, true); + } + + @Override + public SAMRecordIterator queryUnmapped() { + return new AssertingIterator(this.reader.queryUnmapped()); + } + + @Override + public SAMRecordIterator queryAlignmentStart(final String sequence, final 
int start) { + return new AssertingIterator(this.reader.queryAlignmentStart(sequence, start)); + } + + @Override + public SAMRecord queryMate(final SAMRecord rec) { + if(!rec.getReadPairedFlag()) { + throw new IllegalArgumentException("queryMate called for unpaired read."); + } else if(rec.getFirstOfPairFlag() == rec.getSecondOfPairFlag()) { + throw new IllegalArgumentException("SAMRecord must be either first and second of pair, but not both."); + } else { + boolean firstOfPair = rec.getFirstOfPairFlag(); + SAMRecordIterator it; + if(rec.getMateReferenceIndex() == -1) { + it = this.queryUnmapped(); + } else { + it = this.queryAlignmentStart(rec.getMateReferenceName(), rec.getMateAlignmentStart()); + } + + try { + SAMRecord mateRec = null; + + while(true) { + SAMRecord next; + while(it.hasNext()) { + next = it.next(); + if(!next.getReadPairedFlag()) { + if(rec.getReadName().equals(next.getReadName())) { + throw new SAMFormatException("Paired and unpaired reads with same name: " + rec.getReadName()); + } + } else { + if(firstOfPair) { + if(next.getFirstOfPairFlag()) { + continue; + } + } else if(next.getSecondOfPairFlag()) { + continue; + } + + if(rec.getReadName().equals(next.getReadName())) { + if(mateRec != null) { + throw new SAMFormatException("Multiple SAMRecord with read name " + rec.getReadName() + " for " + (firstOfPair?"second":"first") + " end."); + } + + mateRec = next; + } + } + } + + next = mateRec; + return next; + } + } finally { + it.close(); + } + } + } + + @Override + public SAMFileSpan getFilePointerSpanningReads() { + return this.reader.indexing().getFilePointerSpanningReads(); + } + + @Override + public void close() throws IOException{ + if(this.reader != null) { + this.reader.close(); + } + + this.reader = null; + } + + @Override + public Type type() { + return this.reader.type(); + } + + @Override + public SAMFileHeader getFileHeader() { + return customHeader != null ? 
customHeader : this.reader.getFileHeader(); } /** diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java index 637539395..e9775d1fe 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java @@ -30,9 +30,11 @@ package org.broadinstitute.gatk.utils; import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.reference.ReferenceSequenceFile; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; @@ -50,6 +52,7 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.*; public class ExampleToCopyUnitTest extends BaseTest { @@ -217,13 +220,18 @@ public class ExampleToCopyUnitTest extends BaseTest { // create a fake BAM file, and iterate through it final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(seq, 20, 10); final File bam = bamBuilder.makeTemporarilyBAMFile(); - final SAMFileReader reader = new SAMFileReader(bam); + final SamReader reader = SamReaderFactory.makeDefault().open(bam); final Iterator bamIt = reader.iterator(); while ( bamIt.hasNext() ) { final SAMRecord read = bamIt.next(); // all reads are actually GATKSAMRecords // TODO -- add some tests that use reads from a BAM } + try { + reader.close(); + } catch ( IOException ex ) { + throw new ReviewedGATKException("Unable to close " + bam , ex); + } } /** diff --git 
a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilderUnitTest.java index c4dfdbe65..048292e30 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilderUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilderUnitTest.java @@ -25,14 +25,17 @@ package org.broadinstitute.gatk.utils.sam; -import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.io.IOException; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; @@ -94,7 +97,7 @@ public class ArtificialBAMBuilderUnitTest extends BaseTest { } final File bam = bamBuilder.makeTemporarilyBAMFile(); - final SAMFileReader reader = new SAMFileReader(bam); + final SamReader reader = SamReaderFactory.makeDefault().open(bam); Assert.assertTrue(reader.hasIndex()); final Iterator bamIt = reader.iterator(); int nReadsFromBam = 0; @@ -106,6 +109,11 @@ public class ArtificialBAMBuilderUnitTest extends BaseTest { Assert.assertTrue(read.getAlignmentStart() >= lastStart); lastStart = read.getAlignmentStart(); } + try { + reader.close(); + } catch ( IOException ex ) { + throw new ReviewedGATKException("Unable to close " + bam , ex); + } Assert.assertEquals(nReadsFromBam, bamBuilder.expectedNumberOfReads()); } From 01a858542f871528454a2c4599cd4440ef3ad4ee Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 7 Oct 2016 12:10:28 -0400 Subject: [PATCH 51/68] Remove RankSumTest and RMSAnnotation from hom-ref sites --- 
.../walkers/variantutils/GenotypeGVCFs.java | 63 +++++++++++++++---- .../GenotypeGVCFsIntegrationTest.java | 14 ++++- .../annotator/VariantAnnotatorEngine.java | 9 +++ 3 files changed, 72 insertions(+), 14 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index 53a2f1790..b298ba25c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -64,9 +64,12 @@ import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; import org.broadinstitute.gatk.engine.walkers.Window; +import org.broadinstitute.gatk.tools.walkers.annotator.RankSumTest; +import org.broadinstitute.gatk.tools.walkers.annotator.RMSAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AS_StandardAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode; import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedArgumentCollection; @@ -188,6 +191,8 @@ public class GenotypeGVCFs extends RodWalker infoFieldAnnotationKeyNamesToRemove = new ArrayList<>(); public List> getCompRodBindings() { return Collections.emptyList(); } public RodBinding getSnpEffRodBinding() { return null; } @@ -223,6 +228,16 @@ public class 
GenotypeGVCFs extends RodWalkeremptyList(), this, toolkit); + // Request INFO field annotations inheriting from RankSumTest and RMSAnnotation added to remove list + for ( final InfoFieldAnnotation annotation : annotationEngine.getRequestedInfoAnnotations() ) { + if ( annotation instanceof RankSumTest || annotation instanceof RMSAnnotation ) { + final List keyNames = annotation.getKeyNames(); + if ( !keyNames.isEmpty() ) { + infoFieldAnnotationKeyNamesToRemove.add(keyNames.get(0)); + } + } + } + // create the genotyping engine // when checking for presence of AS_StandardAnnotation we must deal with annoying feature that // the class name with or without the trailing "Annotation" are both valid command lines @@ -321,7 +336,6 @@ public class GenotypeGVCFs extends RodWalker cleanupGenotypeAnnotations(final VariantContext VC, final boolean createRefGTs) { - final GenotypesContext oldGTs = VC.getGenotypes(); + private List cleanupGenotypeAnnotations(final VariantContext vc, final boolean createRefGTs) { + final GenotypesContext oldGTs = vc.getGenotypes(); final List recoveredGs = new ArrayList<>(oldGTs.size()); + for ( final Genotype oldGT : oldGTs ) { final Map attrs = new HashMap<>(oldGT.getExtendedAttributes()); @@ -455,15 +494,15 @@ public class GenotypeGVCFs extends RodWalker refAlleles = Collections.nCopies(ploidy,VC.getReference()); + final List refAlleles = Collections.nCopies(ploidy,vc.getReference()); //keep 0 depth samples and 0 GQ samples as no-call if (depth > 0 && oldGT.hasGQ() && oldGT.getGQ() > 0) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 47e813204..891705794 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -251,7 +251,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-allSites"), 1, - Collections.singletonList("77924e6b958a30f954e1c3a9f504a6a7")); + Collections.singletonList("764ac46e0b985db187d85655240f7ec0")); spec.disableShadowBCF(); executeTest("testAllSitesNonBiallelic", spec); } @@ -685,9 +685,19 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { @Test public void testNewQualNaNBugFix() { final WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -newQual -V " + privateTestDir + "input-newqual-nan-bug-fix.vcf", b37KGReferenceWithDecoy), + baseTestString(" -newQual -V " + privateTestDir + "input-newqual-nan-bug-fix.vcf", b37KGReferenceWithDecoy), Collections.singletonList("503f4193c22fbcc451bd1c425b8b6bf8")); spec.disableShadowBCF(); executeTest("testNewQualNaNBugFix", spec); } + + @Test + public void testHomRefHighMQ() { + final WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V " + privateTestDir + "NA18503.22.vcf -V " + privateTestDir + "NA18504.22.vcf -V " + + privateTestDir + "NA18505.22.vcf -allSites", b37KGReference), + Collections.singletonList("6d253024246e1024b9b6e8f885f53799")); + spec.disableShadowBCF(); + executeTest("testHomRefHighMQ", spec); + } } \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java index 193c01120..e978037b7 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java @@ 
-62,6 +62,15 @@ public class VariantAnnotatorEngine { // Map of info field name to info field private final Map hInfoMap = new HashMap<>(); + /** + * Get the requested INFO field annotations + * + * @return requested INFO field annotations + */ + public List getRequestedInfoAnnotations() { + return requestedInfoAnnotations; + } + protected static class VAExpression { public String fullName, fieldName; From d7090ef1586d87b4574e8f6d0057a29648e9ba2f Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 21 Oct 2016 13:01:07 -0400 Subject: [PATCH 52/68] Write saved WARN messages to stderr instead of stdout --- .../gatk/utils/commandline/CommandLineProgram.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java index 4f5afcc08..0cb132bd6 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java @@ -363,10 +363,10 @@ public abstract class CommandLineProgram { * @param listAppender Appender for saving logging messages to a list */ private static void printDoneAndLogMessages(final ListAppender listAppender) { - System.out.println("------------------------------------------------------------------------------------------"); - System.out.print("Done. "); + System.err.println("------------------------------------------------------------------------------------------"); + System.err.print("Done. 
"); listAppender.write(); - System.out.println("------------------------------------------------------------------------------------------"); + System.err.println("------------------------------------------------------------------------------------------"); } /** From cc91052e6961e4592aa8d3a977e246957ae43f2a Mon Sep 17 00:00:00 2001 From: Steve Huang Date: Thu, 27 Oct 2016 11:19:49 -0400 Subject: [PATCH 53/68] fixed a max priority Q error while removing alt alleles when faced with high ploidy and allele count; added hackish integration test (#1457) --- .../HaplotypeCallerGenotypingEngine.java | 164 +++++++++++------- ...plotypeCallerGenotypingEngineUnitTest.java | 102 ++++++++++- .../HaplotypeCallerIntegrationTest.java | 10 ++ 3 files changed, 208 insertions(+), 68 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 1f31e7cd6..2a57c09ac 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -54,10 +54,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.common.annotations.VisibleForTesting; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import htsjdk.samtools.util.StringUtil; import htsjdk.variant.variantcontext.*; -import org.apache.commons.lang.ArrayUtils; -import org.apache.commons.math3.stat.StatUtils; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; @@ -87,8 +84,8 @@ 
public class HaplotypeCallerGenotypingEngine extends GenotypingEngine practicaAlleleCountForPloidy = new HashMap<>(); + private final int maxGenotypeCountToEnumerate; + private final Map practicalAlleleCountForPloidy = new HashMap<>(); private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; @@ -105,13 +102,18 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine readAlleleLikelihoods = readLikelihoods.marginalize(practicalAlleleMapper, + final ReadLikelihoods readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION)); if (configuration.isSampleContaminationPresent()) @@ -358,52 +340,82 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> reduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(final Map> alleleMapper, final int desiredNumOfAlleles) { + private VariantContext removeAltAllelesIfTooManyGenotypes(final int ploidy, final Map> alleleMapper, final VariantContext mergedVC) { - final PriorityQueue altAlleleMaxPriorityQ = new PriorityQueue<>((sa1, sa2) -> - sa2.compareTo(sa1)); // -1 to turn it into max priority q + final int originalAlleleCount = alleleMapper.size(); + practicalAlleleCountForPloidy.putIfAbsent(ploidy, GenotypeLikelihoodCalculators.computeMaxAcceptableAlleleCount(ploidy, maxGenotypeCountToEnumerate)); + final int practicalAlleleCount = practicalAlleleCountForPloidy.get(ploidy); - final Set allelesToRetain = new HashSet<>(); - // populate allelePriorityQ with the relevant information + if (originalAlleleCount > practicalAlleleCount) { + final List allelesToKeep = whichAllelesToKeepBasedonHapScores(alleleMapper, practicalAlleleCount); + alleleMapper.keySet().retainAll(allelesToKeep); + logger.warn(String.format("Removed alt alleles where ploidy is %d and original allele count is %d, whereas after trimming the allele count becomes %d. 
Alleles kept are:%s", + ploidy, originalAlleleCount, practicalAlleleCount, allelesToKeep)); + return removeExcessAltAllelesFromVC(mergedVC, allelesToKeep); + } else { + return mergedVC; + } + } + + /** + * Returns a list of alleles that is a subset of the key set of input map {@code alleleMapper}. + * The size of the returned list is min({@code desiredNumOfAlleles}, alleleMapper.size()). + * + * Alleles kept are guaranteed to have higher precedence than those removed, where precedence is determined by + * {@link AlleleScoredByHaplotypeScores}. + * + * Entries in the returned list are guaranteed to have the same relative order as they were in the input map. + * + * @param alleleMapper original allele to haplotype map + * @param desiredNumOfAlleles desired allele count, including ref allele + */ + @VisibleForTesting + static List whichAllelesToKeepBasedonHapScores(final Map> alleleMapper, + final int desiredNumOfAlleles) { + + if(alleleMapper.size() <= desiredNumOfAlleles){ + return alleleMapper.keySet().stream().collect(Collectors.toList()); + } + + final PriorityQueue alleleMaxPriorityQ = new PriorityQueue<>(); for(final Allele allele : alleleMapper.keySet()){ - - if(allele.isReference()){ // collect scores information only on alt alleles; ref allele is never trimmed by this function - allelesToRetain.add(allele); - continue; - } - - final List hapScores = alleleMapper.get(allele).stream().map(hap -> hap.getScore()).collect(Collectors.toList()); - Collections.sort(hapScores); + final List hapScores = alleleMapper.get(allele).stream().map(Haplotype::getScore).sorted().collect(Collectors.toList()); final Double highestScore = hapScores.get(hapScores.size()-1); final Double secondHighestScore = hapScores.size()>1 ? 
hapScores.get(hapScores.size()-2) : Double.NEGATIVE_INFINITY; - altAlleleMaxPriorityQ.add(new AlleleScoredByHaplotypeScores(allele, highestScore, secondHighestScore)); + alleleMaxPriorityQ.add(new AlleleScoredByHaplotypeScores(allele, highestScore, secondHighestScore)); } + final Set allelesToRetain = new LinkedHashSet<>(); while(allelesToRetain.size() allelesToRetain.contains(p.getKey())) - .collect(Collectors.toMap(p->p.getKey(), p->p.getValue())); + return alleleMapper.keySet().stream().filter(allelesToRetain::contains).collect(Collectors.toList()); } /** * A utility class that provides ordering information, given best and second best haplotype scores. * If there's a tie between the two alleles when comparing their best haplotype score, the second best haplotype score * is used for breaking the tie. In the case that one allele doesn't have a second best allele, i.e. it has only one - * supportive haplotype, its second best score is set as null, and is always considered "worse" than another allele - * that has the same best haplotype score, but also has a second best haplotype score. - * TODO: in the extremely unlikely case that two alleles, having the same best haplotype score, neither have a second - * best haplotype score, the case is undecided. + * supportive haplotype, its second best score is set as {@link Double#NEGATIVE_INFINITY}. + * In the extremely unlikely cases that two alleles, having the same best haplotype score, neither have a second + * best haplotype score, or the same second best haplotype score, the order is exactly the same as determined by + * {@link Allele#compareTo(Allele)}. 
*/ - private static final class AlleleScoredByHaplotypeScores { + private static final class AlleleScoredByHaplotypeScores implements Comparable{ private final Allele allele; private final Double bestHaplotypeScore; private final Double secondBestHaplotypeScore; @@ -414,13 +426,21 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine other.bestHaplotypeScore) { - return 1; - } else if (bestHaplotypeScore < other.bestHaplotypeScore) { + + if(allele.isReference() && other.allele.isNonReference()){ return -1; + } else if(allele.isNonReference() && other.allele.isReference()){ + return 1; + } else if(bestHaplotypeScore > other.bestHaplotypeScore) { + return -1; + } else if (bestHaplotypeScore < other.bestHaplotypeScore) { + return 1; + } else if (!secondBestHaplotypeScore.equals(other.secondBestHaplotypeScore)) { + return secondBestHaplotypeScore > other.secondBestHaplotypeScore ? -1 : 1; } else { - return secondBestHaplotypeScore > other.secondBestHaplotypeScore ? 1 : -1; + return allele.compareTo(other.allele); } } @@ -429,6 +449,28 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine allelesToKeep){ + Utils.validateArg(allelesToKeep!=null, "alleles to keep is null"); + Utils.validateArg(!allelesToKeep.contains(null), "alleles to keep contains null elements"); + Utils.validateArg(allelesToKeep.stream().anyMatch(Allele::isReference), "alleles to keep doesn't contain reference allele!"); + Utils.validateArg(inputVC.getAlleles().containsAll(allelesToKeep), "alleles to keep is not a subset of input VC alleles"); + if(inputVC.getAlleles().size() == allelesToKeep.size()) return inputVC; + + final VariantContextBuilder vcb = new VariantContextBuilder(inputVC); + final List originalList = inputVC.getAlleles(); + originalList.retainAll(allelesToKeep); + vcb.alleles(originalList); + return vcb.make(); + } + /** * Reduce the number alternative alleles in a read-likelihoods collection to the maximum-alt-allele user parameter value. *

    diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java index 25ed5e59e..ac4476037 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java @@ -444,12 +444,12 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { @Test(dataProvider="ConstructPhaseSetMappingProvider") public void testConstructPhaseSetMapping(final List calls, - final Map> haplotypeMap, - final int totalHaplotypes, - final int expectedMapSize, - final int expectedNumGroups, - final int expectedNum01, - final int expectedNum10) { + final Map> haplotypeMap, + final int totalHaplotypes, + final int expectedMapSize, + final int expectedNumGroups, + final int expectedNum01, + final int expectedNum10) { final Map> actualPhaseSetMapping = new HashMap<>(); final int actualNumGroups = HaplotypeCallerGenotypingEngine.constructPhaseSetMapping(calls, haplotypeMap, totalHaplotypes, actualPhaseSetMapping); Assert.assertEquals(actualNumGroups, expectedNumGroups); @@ -558,11 +558,99 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { final PloidyModel ploidyModel = new HomogeneousPloidyModel(indexedSampleList, 2); final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel(); - final GenotypingLikelihoods genotypeLikelihoods = genotypingModel.calculateLikelihoods(readLikelihoods, new GenotypingData<>(ploidyModel,readLikelihoods)); + final GenotypingLikelihoods genotypeLikelihoods = genotypingModel.calculateLikelihoods(readLikelihoods, new 
GenotypingData<>(ploidyModel, readLikelihoods)); // test final Set excessAltAlleles = HaplotypeCallerGenotypingEngine.excessAlternativeAlleles(genotypeLikelihoods, 2); Assert.assertFalse(excessAltAlleles.contains(ref)); Assert.assertEquals(excessAltAlleles.size(), 1); } + + @Test + public void testReduceNumberOfAlternativeAllelesBasedOnHaplotypesScores(){ + + // first have a list of alleles, one ref, several alt + final Allele ref = Allele.create("A", true); + final Allele altC = Allele.create("C", false); + final Allele altT = Allele.create("T", false); + final Allele altT2 = Allele.create("TT", false); + final Allele altG = Allele.create("G", false); + + // then create several haplotypes, assign ad-hoc scores + final Haplotype hapRef = new Haplotype("AAAAA".getBytes()); + hapRef.setScore(Double.MAX_VALUE); + + // test case when both same best score and second best score are the same + final Haplotype hapT = new Haplotype("TAAAA".getBytes()); + hapT.setScore(-2.0); + final Haplotype hapTAnother = new Haplotype("TAAAT".getBytes()); + hapTAnother.setScore(-3.0); + final Haplotype hapT2 = new Haplotype("TTAAA".getBytes()); + hapT2.setScore(-2.0); + final Haplotype hapT2Another = new Haplotype("TTAAT".getBytes()); + hapT2Another.setScore(-3.0); + + final Haplotype hapC = new Haplotype("CAAAA".getBytes()); + hapC.setScore(-3.0); + + // for case when there's tie in highest haplotype score + final Haplotype hapG = new Haplotype("GAAAA".getBytes()); + hapG.setScore(-3.0); + final Haplotype hapGAnother = new Haplotype("GAAAG".getBytes()); + hapGAnother.setScore(-5.0); + + final Map> alleleMapper = new LinkedHashMap<>(); + alleleMapper.put(ref, Arrays.asList(hapRef)); + alleleMapper.put(altC, Arrays.asList(hapC)); + alleleMapper.put(altT, Arrays.asList(hapT, hapTAnother)); + alleleMapper.put(altT2, Arrays.asList(hapT2, hapT2Another)); + alleleMapper.put(altG, Arrays.asList(hapG, hapGAnother)); + + List allelesToKeep = 
HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 5); + Assert.assertEquals(allelesToKeep.size(), 5); + + Iterator it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + Assert.assertEquals(it.next(), altC); + Assert.assertEquals(it.next(), altT); + Assert.assertEquals(it.next(), altT2); + Assert.assertEquals(it.next(), altG); + + allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 4); + Assert.assertEquals(allelesToKeep.size(), 4); + it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + Assert.assertEquals(it.next(), altT); + Assert.assertEquals(it.next(), altT2); + Assert.assertEquals(it.next(), altG); + + allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 3); + Assert.assertEquals(allelesToKeep.size(), 3); + it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + Assert.assertEquals(it.next(), altT); + Assert.assertEquals(it.next(), altT2); + + allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 2); + Assert.assertEquals(allelesToKeep.size(), 2); + it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + Assert.assertEquals(it.next(), altT); + + allelesToKeep = HaplotypeCallerGenotypingEngine.whichAllelesToKeepBasedonHapScores(alleleMapper, 1); + Assert.assertEquals(allelesToKeep.size(), 1); + it = allelesToKeep.iterator(); + Assert.assertEquals(it.next(), ref); + } + + + @Test + public void testRemoveExcessiveAltAlleleFromVC(){ + final VariantContext originalVC = new VariantContextBuilder("source", "1", 1000000, 1000000, Arrays.asList(Allele.create("A", true), Allele.create("T", false), Allele.create("C", false), Allele.create("G", false))).make(); + + final VariantContext reducedVC = HaplotypeCallerGenotypingEngine.removeExcessAltAllelesFromVC(originalVC, Arrays.asList(Allele.create("A", true), Allele.create("T", false), 
Allele.create("C", false))); + + Assert.assertEquals(reducedVC.getNAlleles(), 3); + Assert.assertTrue(reducedVC.getAlleles().containsAll(Arrays.asList(Allele.create("A", true), Allele.create("T", false), Allele.create("C", false)))); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 5303ecdf7..6e28abe01 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -512,5 +512,15 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5Variants, md5BAMOut)); executeTest("testHaplotypeCallerReadPosRankSum", spec); } + + @Test + public void testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores() { + final File testBAM = new File(privateTestDir + "pretendTobeTetraPloidTetraAllelicSite.bam"); + final String md5 = "289304f56833ea76b60cd08763b0f68b"; + final String base = String.format("-T HaplotypeCaller -R %s -I %s -L 20:11363580-11363600 -ploidy 4 -maxGT 15 ", REF, testBAM) + + " --no_cmdline_in_header -o %s"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5)); + executeTest("testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores", spec); + } } From 029632eb1c15fe7c97b1be6422912858f1347cc8 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Tue, 1 Nov 2016 03:16:21 -0400 Subject: [PATCH 54/68] backport numerics changes in new qual --- .../afcalc/AlleleFrequencyCalculator.java | 48 +++++++++++-------- .../broadinstitute/gatk/utils/MathUtils.java | 4 ++ 2 files changed, 31 insertions(+), 21 
deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java index abfddab82..21eb94e15 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java @@ -144,30 +144,31 @@ public final class AlleleFrequencyCalculator extends AFCalculator { } final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy(); final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles); - final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); + + final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); //the total probability - log10PNoVariant += Math.log10(genotypePosteriors[HOM_REF_GENOTYPE_INDEX]); + log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX]; // per allele non-log space probabilities of zero counts for this sample // for each allele calculate the total probability of genotypes containing at least one copy of the allele - final double[] pOfNonZeroAltAlleles = new double[numAlleles]; + final double[] log10ProbabilityOfNonZeroAltAlleles = new double[numAlleles]; + Arrays.fill(log10ProbabilityOfNonZeroAltAlleles, Double.NEGATIVE_INFINITY); for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) { - final double genotypePosterior = genotypePosteriors[genotype]; + final double log10GenotypePosterior = log10GenotypePosteriors[genotype]; glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) -> - pOfNonZeroAltAlleles[alleleIndex] += genotypePosterior); 
+ log10ProbabilityOfNonZeroAltAlleles[alleleIndex] = + MathUtils.log10SumLog10(log10ProbabilityOfNonZeroAltAlleles[alleleIndex], log10GenotypePosterior)); } - // Make sure that we handle appropriately pOfNonZeroAltAlleles that are close to 1; values just over 1.0 due to - // rounding error would result in NaN. - // As every allele is present in at least one genotype, the p-non-zero-count for - // any allele is bound above by 1.0 - minimum genotype posterior because at least one genotype - // does not contain this allele. - final double maximumPNonZeroCount = 1.0 - MathUtils.arrayMin(genotypePosteriors); - for (int allele = 0; allele < numAlleles; allele++) { - log10POfZeroCountsByAllele[allele] += Math.log10(1.0 - Math.min(maximumPNonZeroCount, pOfNonZeroAltAlleles[allele])); + // if prob of non hom ref == 1 up to numerical precision, short-circuit to avoid NaN + if (log10ProbabilityOfNonZeroAltAlleles[allele] >= 0) { + log10POfZeroCountsByAllele[allele] = Double.NEGATIVE_INFINITY; + } else { + log10POfZeroCountsByAllele[allele] += MathUtils.log10OneMinusPow10(log10ProbabilityOfNonZeroAltAlleles[allele]); + } } } @@ -183,37 +184,42 @@ public final class AlleleFrequencyCalculator extends AFCalculator { // we compute posteriors here and don't have the same prior that AFCalculationResult expects. 
Therefore, we // give it our posterior as its "likelihood" along with a flat dummy prior final double[] dummyFlatPrior = {-1e-10, -1e-10}; //TODO: HACK must be negative for AFCalcResult - final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, Math.log10(1 - Math.pow(10, log10PNoVariant))}; + final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, MathUtils.log10OneMinusPow10(log10PNoVariant)}; return new AFCalculationResult(integerAltAlleleCounts, DUMMY_N_EVALUATIONS, alleles, log10PosteriorOfNoVariantYesVariant, dummyFlatPrior, log10PRefByAllele); } + // effectiveAlleleCounts[allele a] = SUM_{genotypes g} (posterior_probability(g) * num_copies of a in g), which we denote as SUM [n_g p_g] + // for numerical stability we will do this in log space: + // count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g) + // thanks to the log-sum-exp trick this lets us work with log posteriors alone private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) { final int numAlleles = vc.getNAlleles(); Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent"); - final double[] result = new double[numAlleles]; + final double[] log10Result = new double[numAlleles]; + Arrays.fill(log10Result, Double.NEGATIVE_INFINITY); for (final Genotype g : vc.getGenotypes()) { if (!g.hasLikelihoods()) { continue; } final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles); - final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); + final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex -> glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) -> - result[alleleIndex] += genotypePosteriors[genotypeIndex] * count)); + log10Result[alleleIndex] = 
MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.Log10Cache.get(count)))); } - return result; + return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x)); } - private static double[] normalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) { + private static double[] log10NormalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) { final double[] log10Likelihoods = g.getLikelihoods().getAsVector(); - final double[] unnormalizedLog10Likelihoods = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> { + final double[] log10Posteriors = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> { final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(genotypeIndex); return gac.log10CombinationCount() + log10Likelihoods[genotypeIndex] + gac.sumOverAlleleIndicesAndCounts((index, count) -> count * log10AlleleFrequencies[index]); }); - return MathUtils.normalizeFromLog10(unnormalizedLog10Likelihoods); + return MathUtils.normalizeFromLog10(log10Posteriors, true); } @Override //Note: unused diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java index 27cb6a6aa..009d4eaae 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java @@ -330,6 +330,10 @@ public class MathUtils { return log10sumLog10(log10values, 0); } + public static double log10SumLog10(final double a, final double b) { + return a > b ? 
a + Math.log10(1 + Math.pow(10.0, b - a)) : b + Math.log10(1 + Math.pow(10.0, a - b)); + } + public static boolean wellFormedDouble(final double val) { return !Double.isInfinite(val) && !Double.isNaN(val); } From df0ba2ce8de85537d1c5761192f36e9d6771a66e Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Mon, 17 Oct 2016 10:09:51 -0400 Subject: [PATCH 55/68] BaseCountsBySample counting bases at a particular position --- .../walkers/annotator/BaseCountsBySample.java | 30 +++------- .../HaplotypeCallerIntegrationTest.java | 4 +- .../tools/walkers/annotator/BaseCounts.java | 45 +++++++++------ .../gatk/utils/sam/AlignmentUtils.java | 44 ++++++++++++++ .../utils/sam/AlignmentUtilsUnitTest.java | 57 ++++++++++++++++++- .../gatk/utils/sam/ReadUtilsUnitTest.java | 6 +- 6 files changed, 141 insertions(+), 45 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java index 13e85e62d..c37b3b650 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java @@ -60,15 +60,14 @@ import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import 
org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; -import org.broadinstitute.gatk.utils.BaseUtils; import java.util.*; +import java.util.stream.Collectors; /** * Count of A, C, G, T bases for each sample @@ -110,8 +109,9 @@ public class BaseCountsBySample extends GenotypeAnnotation { final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() ) - gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, getBaseCounts(alleleLikelihoodMap, vc)); + if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() ) { + gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, Arrays.stream(getBaseCounts(alleleLikelihoodMap, vc)).boxed().collect(Collectors.toList())); + } } @Override @@ -123,31 +123,15 @@ public class BaseCountsBySample extends GenotypeAnnotation { } /** - * Base counts given for the most likely allele + * Counts of observed bases at a genomic position e.g. {13,0,0,1} at chr1:100,000,000 * * @param perReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood. 
* @param vc variant context * @return count of A, C, G, T bases - * @throws IllegalStateException if alleles in vc are not in perReadAlleleLikelihoodMap */ private int[] getBaseCounts(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc) { final Set alleles = new HashSet<>(vc.getAlleles()); - // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext - if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) - throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet()); - - final int[] counts = new int[4]; - for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); - if (! a.isInformative() ) continue; // read is non-informative - for (final byte base : el.getKey().getReadBases() ){ - int index = BaseUtils.simpleBaseToBaseIndex(base); - if ( index != -1 ) - counts[index]++; - } - } - - return counts; + return AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, alleles, vc.getStart()); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6e28abe01..605af9b54 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -488,8 +488,8 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } @Test - public void 
testHBaseCountsBySample() throws IOException{ - HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "c4550a5933cc954bad70980750e0df52"); + public void testBaseCounts() throws IOException{ + HCTest(CEUTRIO_BAM, "-A BaseCountsBySample -A BaseCounts", "40def0e9c06031d6b624a22a093574c0"); } @Test diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java index e13e9f1ed..4c607b52c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java @@ -25,22 +25,22 @@ package org.broadinstitute.gatk.tools.walkers.annotator; +import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.BaseUtils; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; +import java.util.stream.Collectors; /** @@ -61,7 +61,7 @@ import java.util.Map; * */ - public 
class BaseCounts extends InfoFieldAnnotation { + public class BaseCounts extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -69,21 +69,34 @@ import java.util.Map; final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( stratifiedContexts.size() == 0 ) + + if ( stratifiedPerReadAlleleLikelihoodMap == null || stratifiedPerReadAlleleLikelihoodMap.isEmpty() ) { return null; + } - int[] counts = new int[4]; + final Map map = new HashMap<>(); + map.put(getKeyNames().get(0), Arrays.stream(getBaseCounts(stratifiedPerReadAlleleLikelihoodMap, vc)).boxed().collect(Collectors.toList())); + return map; + } - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - for (byte base : sample.getValue().getBasePileup().getBases() ) { - int index = BaseUtils.simpleBaseToBaseIndex(base); - if ( index != -1 ) - counts[index]++; + /** + * Counts of observed bases at a genomic position (e.g. {13,0,0,1} at chr1:100,000,000) over all samples + * + * @param stratifiedPerReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood. 
+ * @param vc variant context + * @return count of A, C, G, T bases + */ + private int[] getBaseCounts(final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + final Set alleles = new HashSet<>(vc.getAlleles()); + final int[] baseCounts = new int[4]; + for ( final Map.Entry strat : stratifiedPerReadAlleleLikelihoodMap.entrySet() ) { + final int[] counts = AlignmentUtils.countBasesAtPileupPosition(strat.getValue(), alleles, vc.getStart()); + for ( int i = 0; i < baseCounts.length; i++ ) { + baseCounts[i] += counts[i]; } } - Map map = new HashMap<>(); - map.put(getKeyNames().get(0), counts); - return map; + + return baseCounts; } public List getKeyNames() { return Arrays.asList(GATKVCFConstants.BASE_COUNTS_KEY); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java index c3d6b5cac..83d3c178c 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java @@ -31,8 +31,11 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; +import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.recalibration.EventType; @@ -1382,4 +1385,45 @@ public final class AlignmentUtils { // 1: xxx I1 yyy new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0) ); + + /** + * Get the counts of bases at a genome 
location for a pileup + * + * @param perReadAlleleLikelihoodMap underlying alleles represented by an aligned read, and corresponding relative likelihood. + * @param alleles the alleles + * @param location genome location + * @return the number of A, C, G, and T bases across all samples, in that order + * @throws IllegalStateException if alleles are not in perReadAlleleLikelihoodMap + */ + @Ensures({"likelihoodReadMap != null", "alleles != null", "location >= 0", "baseCounts != null && !baseCounts.isEmpty()"}) + public static int[] countBasesAtPileupPosition(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final Set alleles, + final int location) throws IllegalStateException { + + if ( perReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("PerReadAlleleLikelihoodMap is null."); } + if ( alleles == null ) { throw new IllegalArgumentException("Alleles are null."); } + if ( location < 0 ) { throw new IllegalArgumentException("location < 0"); } + + // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext + if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) { + throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet()); + } + + final int[] baseCounts = new int[4]; + for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); + if (a.isInformative()) { + final byte[] bases = el.getKey().getReadBases(); + final int position = location - el.getKey().getAlignmentStart(); + if (position >= 0 && position < bases.length) { + final byte[] coveredBases = AlignmentUtils.getBasesCoveringRefInterval(position, position, bases, 0, el.getKey().getCigar()); + if ( coveredBases != null && coveredBases.length != 0 ) { + final int index = 
BaseUtils.simpleBaseToBaseIndex(coveredBases[0]); + if (index != -1) baseCounts[index]++; + } + } + } + } + + return baseCounts; + } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java index 160d2e51f..2ad8ef367 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java @@ -26,17 +26,24 @@ package org.broadinstitute.gatk.utils.sam; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.variantcontext.Allele; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import org.broadinstitute.gatk.utils.BaseTest; +import java.io.File; +import java.io.FileNotFoundException; import java.util.*; -public class AlignmentUtilsUnitTest { +public class AlignmentUtilsUnitTest extends BaseTest { private final static boolean DEBUG = false; private SAMFileHeader header; @@ -1064,4 +1071,52 @@ public class AlignmentUtilsUnitTest { Assert.assertEquals(originalCigar.equals(newCigar), !cigar.endsWith("D")); } + + @DataProvider(name = "CountBasesAtPileupPositionData") + public Object[][] makeCountBasesAtPileupPositionData() throws FileNotFoundException { + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(publicTestDir + "exampleFASTA.fasta")); + final SAMFileHeader header = 
ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final byte[] bases = new byte[]{'A','C','G','T'}; + final byte[] basesSnp = new byte[]{'A','A','G','T'}; + final byte[] basesDel = new byte[]{'A','G','T'}; + final byte[] basesIns = ArrayUtils.addAll(new byte[]{'A'}, bases); + final Allele allele = Allele.create(new byte[]{'A'}, false); + final byte[] quals = new byte[] {30,30,30,30}; + final byte[] qualsDel = new byte[] {30,30,30}; + final byte[] qualsIns = new byte[] {30,30,30,30,30}; + + return new Object[][]{ + { header, new byte[][]{bases, bases}, new byte[][]{quals, quals}, allele, new String[]{"4M", "4M"}, new int[][]{{2,0,0,0}, {0,2,0,0}, {0,0,2,0}, {0,0,0,2}} }, + { header, new byte[][]{basesSnp, bases}, new byte[][]{quals, quals}, allele, new String[]{"4M", "4M"}, new int[][]{{2,0,0,0}, {1,1,0,0}, {0,0,2,0}, {0,0,0,2}} }, + { header, new byte[][]{basesDel, bases}, new byte[][]{qualsDel, quals}, allele, new String[]{"1M1D2M", "4M"}, new int[][]{{2,0,0,0}, {0,1,0,0}, {0,0,2,0}, {0,0,0,2}} }, + { header, new byte[][]{basesIns, bases}, new byte[][]{qualsIns, quals}, allele, new String[]{"1M1I3M", "4M"}, new int[][]{{2,0,0,0}, {0,2,0,0}, {0,0,2,0}, {0,0,0,2}} } + }; + } + + @Test(dataProvider = "CountBasesAtPileupPositionData") + public void testCountBasesAtPileupPosition(final SAMFileHeader header, final byte[][] bases, final byte[][] quals, final Allele allele, final String[] cigar, final int[][] expected) { + + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + + for ( int i = 1; i <= bases.length; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, 1, bases[i-1], quals[i-1], cigar[i-1]); + perReadAlleleLikelihoodMap.add(read, allele, -0.01); + } + + final Set alleles = new HashSet<>(Arrays.asList(allele)); + final int endPosition = Math.min(bases[0].length, bases[1].length); + for ( int i = 1; i <= endPosition; i++ ) { + final int[] 
baseCounts = AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, alleles, i); + Assert.assertEquals(baseCounts, expected[i-1]); + } + } + + @Test(dataProvider = "CountBasesAtPileupPositionData", expectedExceptions = IllegalStateException.class) + public void testCountBasesAtPileupPositionException(final SAMFileHeader header, final byte[][] bases, final byte[][] quals, final Allele allele, final String[] cigar, final int[][] expected) { + + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + + final Set wrongAlleles = new HashSet<>(Arrays.asList(allele)); + AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, wrongAlleles, bases.length); + } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java index 9a9a540a4..6df455d18 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java @@ -208,7 +208,7 @@ public class ReadUtilsUnitTest extends BaseTest { public void testGetMaxReadLength() { for( final int minLength : Arrays.asList( 5, 30, 50 ) ) { for( final int maxLength : Arrays.asList( 50, 75, 100 ) ) { - final List reads = new ArrayList(); + final List reads = new ArrayList<>(); for( int readLength = minLength; readLength <= maxLength; readLength++ ) { reads.add( ReadUtils.createRandomRead( readLength ) ); } @@ -216,7 +216,7 @@ public class ReadUtilsUnitTest extends BaseTest { } } - final List reads = new LinkedList(); + final List reads = new LinkedList<>(); Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); } @@ -254,7 +254,7 @@ public class ReadUtilsUnitTest extends BaseTest { @DataProvider(name = "HasWellDefinedFragmentSizeData") public Object[][] 
makeHasWellDefinedFragmentSizeData() throws Exception { - final List tests = new LinkedList(); + final List tests = new LinkedList<>(); // setup a basic read that will work final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); From bcbd7fe952edf6ccee822c8401339dfe702a77ec Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Thu, 10 Nov 2016 16:53:13 -0500 Subject: [PATCH 56/68] Change default value of STANDARD_CONFIDENCE_FOR_CALLING to 10 --- ...GenotypeCalculationArgumentCollection.java | 2 +- .../walkers/genotyper/GenotypingEngine.java | 4 ++-- .../NanoSchedulerIntegrationTest.java | 2 +- ...perGeneralPloidySuite1IntegrationTest.java | 4 ++-- ...dGenotyperIndelCallingIntegrationTest.java | 12 +++++------ .../UnifiedGenotyperIntegrationTest.java | 16 +++++++-------- ...GenotyperNormalCallingIntegrationTest.java | 4 ++-- .../HaplotypeCallerGVCFIntegrationTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 8 ++++---- .../GenotypeGVCFsIntegrationTest.java | 20 +++++++++---------- 10 files changed, 37 insertions(+), 37 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index f3915f555..c74a3b751 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -122,7 +122,7 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ * is the default). 
*/ @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + public double STANDARD_CONFIDENCE_FOR_CALLING = 10.0; /** * This argument allows you to emit low quality calls as filtered records. diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index 04e33c53b..fe801b9b0 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -378,7 +378,7 @@ public abstract class GenotypingEngine= QualityUtils.qualToErrorProbLog10(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING/3)) + if (normalizedLog10ACeq0Posterior >= QualityUtils.qualToErrorProbLog10(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING)) return 0.0; return 1.0 - Math.pow(10.0, normalizedLog10ACeq0Posterior); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java index 7ed81fdb6..d23a6fe92 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java @@ -70,7 +70,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nt : Arrays.asList(1, 2) ) for ( final int nct 
: Arrays.asList(1, 2) ) { - tests.add(new Object[]{ "BOTH", "52f590f6b37a1b3b12042ae917738965", nt, nct }); + tests.add(new Object[]{ "BOTH", "e2fdd36a4eda18f748df944b428fa392", nt, nct }); } return tests.toArray(new Object[][]{}); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 5de6bcebf..8e9027de7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -69,7 +69,7 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testSNP_ACS_Pools() { - executor.PC_LSV_Test_short("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "ebdf749d404aaef298780a53059a4f93"); + executor.PC_LSV_Test_short("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "853b47780322b8133577aea528b9fd77"); } @Test(enabled = true) @@ -88,6 +88,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe //TODO the old MD5 is kept for the record. //TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach. 
// executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2"); - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "5b76f96b6b74944e0c0d9914700588f0"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "f6b9e1ac0c51c9702525ee52bb2db18a"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 2b94dfc7a..44125d861 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -78,7 +78,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("32bece91e170d623092817738faddb4e")); + Arrays.asList("96afa04944156c1ca5028e5506ba8b94")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -105,7 +105,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("dd66e5f8a6e43be0e473251185a4f38a")); + Arrays.asList("7e211573190003342af274e64a0612fb")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -115,7 +115,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + 
"indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("aa56ed44e77162efce45c936c485769e")); + Arrays.asList("50622e495cad2a24fbc4a80f1281d4dc")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -125,7 +125,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("a4b6434c59c4b119e480ddafc86de234")); + Arrays.asList("50622e495cad2a24fbc4a80f1281d4dc")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -181,7 +181,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("2a82d1586b2148e8d902da5cf8538210")); + Arrays.asList("0d1a5c865c382f1f0ca6f0f104478366")); executeTest("test minIndelFraction 0.0", spec); } @@ -189,7 +189,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("3184a3f58b3aeafcd97280af708a04bb")); + Arrays.asList("aab86cec61adaeb3a5c6887e70211663")); executeTest("test minIndelFraction 0.25", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9fc875267..18f091d4c 
100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -129,12 +129,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f6937cc8ec068f2d38b5d277a92be34b"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "06386b0a4495583aa924e5addd56c5dc"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "1cddd7b1e730765c2b7b55d8a1d69b4c"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "b8b21ad6a2ff1f908e8e0073b57ba0e0"); } private void testOutputParameters(final String args, final String md5) { @@ -190,12 +190,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "6b8bdde9d303139806c5177fae53b1fd" ); + testHeterozosity( 0.01, "7bbba110f720fc8c115fe2d53b34d693" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "b1604d1ba68dfe2fcfb861ef6420a8ba" ); + testHeterozosity( 1.0 / 1850, "767e8eacd216ac7437456e690287cecf" ); } private void testHeterozosity(final double arg, final String md5) { @@ -238,7 +238,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "398d3ad38834fea8961ab6f46a21dc4b"; + String md5 = "75b4b097747f91b8b7ceea153d2b7e1c"; final String myCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( @@ -274,7 +274,7 @@ public 
class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("7ed55f70feeacf8ecc6b36f0d741dfc7")); + Arrays.asList("6464138c0bd2ab2cac06773f19e37a4c")); executeTest(String.format("test multiple technologies"), spec); } @@ -293,7 +293,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("90224ac1c9e2ce9b77fee8dd6e044efe")); + Arrays.asList("8885492069487efcf67e13608e762acd")); executeTest(String.format("test calling with BAQ"), spec); } @@ -310,7 +310,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + "-A SnpEff", 1, - Arrays.asList("2a1eced23dd605d1b0a3efde3f04e23f")); + Arrays.asList("81ac0ffd22a0d0907848019944034359")); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index a8f3f6187..0ec669c01 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -70,7 +70,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - 
Arrays.asList("f03e4ef62d6614c9b1b0a600f7e9f16d")); + Arrays.asList("605f447127bf9c92f60bbaa9c6a6732e")); executeTest("test MultiSample Pilot1", spec); } @@ -94,7 +94,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("281db46f39e3367f207838c620a82bd2")); + Arrays.asList("e5c34be242c9b6bec687c7384ef83cb2")); executeTest("test SingleSample Pilot2", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index b2a86148e..1b9f72c7a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -87,7 +87,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { //TODO this might need to be addressed at some point. 
//TODO the following test is commented out for the record //tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8d30370465d74fd549d76dd31adc4c0c"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "a4286ada7f9efaa83f7a8f0e72c3cb45"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "cf5545094ebb264fa8eb879fd848d9ef"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "0086cc735cf792a9f236ec057c73b750"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"}); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 605af9b54..aadd03a32 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -107,7 +107,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeBAMOutFlags() throws IOException { - HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "a6abb0aa68d3b4d15185a119350e76dc", "d38aab5bf8ef0bc7c18e8c909819da84"); + HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "700c5d20e9d9d9a431fcda9bff91f72e", "a0daf5a80158d4a462248415c1e17565"); } @Test @@ -203,7 +203,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void 
testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "3625167f0e788d409c7eab1898d5eafe"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "49b8fb444c6f88def2069b8b0efe47c7"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -361,7 +361,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestAggressivePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1, - Arrays.asList("c2dab66ad3740320004874c83051bbfc")); + Arrays.asList("fcc81209c562f3c7f1627b187a4dfab4")); executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); } @@ -369,7 +369,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestConservativePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1, - Arrays.asList("a8ea15ac136042891434ccb0b3c3b686")); + Arrays.asList("61aef3fe9d18eec1df526e99a8456115")); executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 891705794..1665d674c 100644 --- 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -108,7 +108,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000", b37KGReference), 1, - Collections.singletonList("b82f29eee8b1369b376ace857bf9b55a")); + Collections.singletonList("7b2a135e694f9d1190e041e6fd420123")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -157,7 +157,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Collections.singletonList("ea11554de21ef8f25e9983db8b5a8480")); + Collections.singletonList("a9ecd152ec4b5b541887a0aed016f40d")); spec.disableShadowBCF(); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); } @@ -171,7 +171,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Collections.singletonList("b304c7e3bb3625a1cdb5531c77b13bcd")); + Collections.singletonList("06e218297fa5399538d13b6a8db4cfe3")); executeTest("combineSingleSamplePipelineGVCFHierarchical", spec); } @@ -183,7 +183,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Collections.singletonList("08adc638b9539fd275836ed008d900ee")); + Collections.singletonList("181fcb5d240b9bd92e3c793ca5aa7954")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); 
} @@ -251,7 +251,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-allSites"), 1, - Collections.singletonList("764ac46e0b985db187d85655240f7ec0")); + Collections.singletonList("2425f2567bfcf187ebae3fb5fa7558b1")); spec.disableShadowBCF(); executeTest("testAllSitesNonBiallelic", spec); } @@ -270,7 +270,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-stand_call_conf 300"), 1, - Collections.singletonList("30903101c5459f602d7004934bc85ca9")); + Collections.singletonList("0ea995f728391647c69f2a3c9a6c1d03")); executeTest("testStandardConf", spec); } @@ -310,7 +310,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + " --uniquifySamples", b37KGReference), 1, - Collections.singletonList("1cb3bddf47c620d294b08acd70d35fa3")); + Collections.singletonList("0c99b1b20fb035a5dada036bd4cf39e5")); executeTest("testUniquifiedSamples", spec); } @@ -590,7 +590,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { baseTestString(" -V " + privateTestDir + "set.zero.RGQs.no.call.sample1.g.vcf" + " -V " + privateTestDir + "set.zero.RGQs.no.call.sample2.g.vcf" + " -L chr16:1279274-1279874 -allSites", hg19ReferenceWithChrPrefixInChromosomeNames), - Collections.singletonList("e88db6e49c12487c55de42769d2f8c6c")); + Collections.singletonList("903047b6262fcb82070556ff74f26a75")); spec.disableShadowBCF(); executeTest("testSetZeroRGQsToNoCall", spec); } @@ -668,7 +668,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testGenotypingSpanningDeletionWithAllSites() { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "spanningDel.genotyping.g.vcf -allSites", b37KGReference), - Collections.singletonList("d3d862faf954f9bb8b1619c3e889ad8c")); + 
Collections.singletonList("04cfe93e92444cbde80e13ca8b8c3913")); spec.disableShadowBCF(); executeTest("testGenotypingSpanningDeletionWithAllSites", spec); } @@ -686,7 +686,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testNewQualNaNBugFix() { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -newQual -V " + privateTestDir + "input-newqual-nan-bug-fix.vcf", b37KGReferenceWithDecoy), - Collections.singletonList("503f4193c22fbcc451bd1c425b8b6bf8")); + Collections.singletonList("e1a7801c9bb5e80d204635bac6105abf")); spec.disableShadowBCF(); executeTest("testNewQualNaNBugFix", spec); } From 2b83dd7c5baa1f0c6838b4199abdbd9dab16d832 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Mon, 28 Sep 2015 17:34:00 -0400 Subject: [PATCH 57/68] Added TreeReduce interface to VariantFiltration --- ...iantFiltrationParallelIntegrationTest.java | 114 ++++++++++++++++++ .../walkers/filters/VariantFiltration.java | 16 ++- 2 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantFiltrationParallelIntegrationTest.java diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantFiltrationParallelIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantFiltrationParallelIntegrationTest.java new file mode 100644 index 000000000..40fdea3e6 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantFiltrationParallelIntegrationTest.java @@ -0,0 +1,114 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 ("BROAD") and the LICENSEE and is effective at the date the downloading is completed ("EFFECTIVE DATE"). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system ("PHONE-HOME") which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE'S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2016 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.tools.walkers.variantutils; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class VariantFiltrationParallelIntegrationTest extends WalkerTest { + + private class ParallelVariantFiltrationTestProvider extends TestDataProvider { + final String reference; + final String args; + final String md5; + final int nt; + + private ParallelVariantFiltrationTestProvider(final String reference, final String args, final String md5, final int nt) { + super(ParallelVariantFiltrationTestProvider.class); + this.reference = reference; + this.args = args; + this.md5 = md5; + this.nt = nt; + } + + public final String getCmdLine() { + return "-T VariantFiltration -R " + reference + " -o %s --no_cmdline_in_header -nt " + nt + " " + args; + } + + public String toString() { + return String.format("ParallelVariantFiltration nt=%d args=%s", nt, args); + } + } + + @DataProvider(name = "ParallelVariantFiltrationTest") + public Object[][] makeParallelSelectTestProvider() { + for ( int nt : Arrays.asList(1, 2, 4) ) { + { + String testfile = privateTestDir + "vcfexample2.vcf" ; + String args = " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar --variant -V " + testfile + + " -L 1:10,020,000-10,021,000"; + new ParallelVariantFiltrationTestProvider(b36KGReference, args, "0c0fddb0eb6f9d3f74556332cd498079", nt); + } + { + final String testfile = privateTestDir + "filteringDepthInFormat.vcf"; + final String args = " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP --invertGenotypeFilterExpression -V " + testfile; + new ParallelVariantFiltrationTestProvider(b37KGReference, args, "c6bc275c97a9e737748d16132ee76f48", nt); + } + { + final String testfile = privateTestDir + "filteringDepthInFormat.vcf"; + final String args = " --genotypeFilterExpression 'DP < 8' --genotypeFilterName lowDP -V " + 
testfile; + new ParallelVariantFiltrationTestProvider(b37KGReference, args, "b0016040127766a4163fcbd91afff3ea", nt); + } + } + + return ParallelVariantFiltrationTestProvider.getTests(ParallelVariantFiltrationTestProvider.class); + } + + @Test(dataProvider = "ParallelVariantFiltrationTest") + public void testParallelSelectTestProvider(final ParallelVariantFiltrationTestProvider cfg) { + final WalkerTestSpec spec = new WalkerTestSpec( cfg.getCmdLine(), 1, Arrays.asList(cfg.md5) ); + executeTest(cfg.toString(), spec); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java index c3087fe42..fca863ff6 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java @@ -27,6 +27,7 @@ package org.broadinstitute.gatk.tools.walkers.filters; import com.google.common.annotations.VisibleForTesting; import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.engine.walkers.TreeReducible; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; @@ -97,7 +98,7 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VAREVAL, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=-50,stop=50)) -public class VariantFiltration extends RodWalker { +public class VariantFiltration extends RodWalker implements TreeReducible { // ----------------------------------------------------------------------------------------------- // Arguments @@ -232,7 +233,7 @@ public class VariantFiltration extends RodWalker { // ----------------------------------------------------------------------------------------------- 
// public methods from base classes // ----------------------------------------------------------------------------------------------- - + @Override public void initialize() { if ( maskExtension < 0 ) { @@ -262,6 +263,7 @@ public class VariantFiltration extends RodWalker { * @param context the context for the given locus * @return 1 if the locus was successfully processed, 0 otherwise */ + @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) { return 0; @@ -320,17 +322,25 @@ public class VariantFiltration extends RodWalker { return 1; } + @Override + public Integer reduceInit() { return 0; } + + @Override public Integer reduce(Integer value, Integer sum) { return sum + value; } - public Integer reduceInit() { return 0; } + @Override + public Integer treeReduce( Integer value, Integer sum ) { + return reduce(value, sum); + } /** * Tell the user the number of loci processed and close out the new variants file. * * @param result the number of loci seen. 
*/ + @Override public void onTraversalDone(Integer result) { // move the window over so that we can filter the last few variants if ( windowInitializer != null ) { From 2d06f332276fdb25e8ca59c8d660d3118d2a09a4 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 8 Nov 2016 14:35:03 -0500 Subject: [PATCH 58/68] Set HTSJDK log level --- .../utils/commandline/CommandLineProgram.java | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java index 0cb132bd6..04ed1c135 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.utils.commandline; +import htsjdk.samtools.util.Log; import org.apache.log4j.FileAppender; import org.apache.log4j.Level; import org.apache.log4j.Logger; @@ -279,30 +280,43 @@ public abstract class CommandLineProgram { } /** - * this function takes the logger level passed in on the command line and uses it to set the level of the logger + * This function takes the logger level passed in on the command line and uses it to set the level of the GATK and HTSJDK loggers. + * Caveat: The HTSJDK logger level can only be set to DEBUG, INFO, WARN or ERROR. A command line FATAL or OFF will set the HTSJDK logger level to ERROR. 
+ * * @throws ArgumentException if the logging level is not valid (DEBUG, INFO, WARN, ERROR, FATAL, OFF) */ private void setupLoggerLevel() { // set the default logger level - Level par; + Level gatkLevel; + Log.LogLevel htsjdkLevel; if (logging_level.toUpperCase().equals("DEBUG")) { - par = Level.DEBUG; + gatkLevel = Level.DEBUG; + htsjdkLevel = Log.LogLevel.DEBUG; } else if (logging_level.toUpperCase().equals("INFO")) { - par = Level.INFO; + gatkLevel = Level.INFO; + htsjdkLevel = Log.LogLevel.INFO; } else if (logging_level.toUpperCase().equals("WARN")) { - par = Level.WARN; + gatkLevel = Level.WARN; + htsjdkLevel = Log.LogLevel.WARNING; } else if (logging_level.toUpperCase().equals("ERROR")) { - par = Level.ERROR; + gatkLevel = Level.ERROR; + htsjdkLevel = Log.LogLevel.ERROR; } else if (logging_level.toUpperCase().equals("FATAL")) { - par = Level.FATAL; + gatkLevel = Level.FATAL; + htsjdkLevel = Log.LogLevel.ERROR; } else if (logging_level.toUpperCase().equals("OFF")) { - par = Level.OFF; + gatkLevel = Level.OFF; + htsjdkLevel = Log.LogLevel.ERROR; } else { // we don't understand the logging level, let's get out of here throw new ArgumentException("Unable to match: " + logging_level + " to a logging level, make sure it's a valid level (DEBUG, INFO, WARN, ERROR, FATAL, OFF)"); } - Logger.getRootLogger().setLevel(par); + // Set GATK log level + Logger.getRootLogger().setLevel(gatkLevel); + + // Set HTSJDK log level + Log.setGlobalLogLevel(htsjdkLevel); } public static String getVersionNumber() { From d61b4c7e91fb294170af51d8b43c2ac8f412b28c Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 15 Nov 2016 14:56:15 -0500 Subject: [PATCH 59/68] Change the truth VCF --- ...ntRecalibrationWalkersIntegrationTest.java | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 7c716fef4..929da55c5 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -64,6 +64,9 @@ import java.util.Arrays; import java.util.List; public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { + private static String TRAINING_VCF = comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"; + private static String TRUTH_VCF = comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_genotypes_2141_samples.b37.vcf"; + private static class VRTest { String inVCF; String aggregateVCF; @@ -93,15 +96,15 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { } VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", - "41e2d951a17de433fe378bb3d9ec75d4", // tranches - "3fe87e69c6a613addb7eff5449e86aa1", // recal file - "78b8f1934d77341df2f6a9fdbd30fa74"); // cut VCF + "3ccb3aa81aebee74d32641105a64ea32", // tranches + "1a87e9cdc66c53891eab61ab39ff2434", // recal file + "217ee1523b6ddaf31f0eb0464b89bab6"); // cut VCF VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf", - "ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches - "5a298554e9175961f63506c4e42ea78b", // recal file - "f284c0cbb00407cc5273c6f1a871513e"); // cut VCF + "be89401e09dd06817c43f152c789f854", // tranches + "8ce11e7555cccb3f13ea34a9074aec00", // recal file + "c09c2425744e8d914d69a2585dba0e97"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() 
{ @@ -119,8 +122,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b37KGReference + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + TRAINING_VCF + + " -resource:training=true,truth=true,prior=12.0 " + TRUTH_VCF + " -T VariantRecalibrator" + " -input " + params.inVCF + " -L 20:1,000,000-40,000,000" + @@ -159,8 +162,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b37KGReference + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + TRAINING_VCF + + " -resource:training=true,truth=true,prior=12.0 " + TRUTH_VCF + " -T VariantRecalibrator" + " -input " + params.inVCF + " -aggregate " + params.aggregateVCF + @@ -210,8 +213,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b37KGReference + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " 
-resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + TRAINING_VCF + + " -resource:training=true,truth=true,prior=12.0 " + TRUTH_VCF + " -T VariantRecalibrator" + " -input " + params.inVCF + " -L 20:10,000,000-20,000,000" + From 6156b85ad9d26793793ab03a50160c463db8dc81 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Mon, 25 Apr 2016 15:28:22 -0400 Subject: [PATCH 60/68] Fixed logic error and tidied AlleleBalance and AlleleBalanceBySample --- .../HaplotypeCallerIntegrationTest.java | 5 + .../walkers/annotator/AlleleBalance.java | 172 ++++++++---------- .../annotator/AlleleBalanceBySample.java | 92 ++++------ 3 files changed, 108 insertions(+), 161 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index aadd03a32..f2a9a6f20 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -522,5 +522,10 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5)); executeTest("testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores", spec); } + + @Test + public void testAlleleBalance() throws IOException{ + HCTest(CEUTRIO_BAM, " -L 20:10001000-10010000 -A AlleleBalance -A AlleleBalanceBySample", "a210161843f4cb80143ff56e4e5c250f"); + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java index a8d06a603..123ea7716 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java @@ -35,9 +35,9 @@ import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.util.Arrays; @@ -70,7 +70,7 @@ import java.util.Map; * */ -public class AlleleBalance extends InfoFieldAnnotation { +public class AlleleBalance extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -78,135 +78,105 @@ public class AlleleBalance extends InfoFieldAnnotation { final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - //if ( stratifiedContexts.size() == 0 ) - // return null; - - if ( !vc.isBiallelic() ) + if ( !(vc.isBiallelic() && vc.hasGenotypes())) { return null; - final GenotypesContext genotypes = vc.getGenotypes(); - if ( !vc.hasGenotypes() ) - return null; - - double ratioHom = 0.0; - double ratioHet = 0.0; - double weightHom = 0.0; - double weightHet = 0.0; - double overallNonDiploid = 0.0; - for ( Genotype genotype : genotypes ) { - - if ( vc.isSNP() ) { - - final 
int[] counts = getCounts(genotype, stratifiedContexts, vc); - // If AD was not calculated, we can't continue - if(counts == null) - continue; - - final int n_allele = counts.length; - int count_sum = 0; - for(int i=0; i bestOtherCount ) - bestOtherCount = counts[i]; - } - final int otherCount = count_sum - alleleCount; - ratioHom += pTrue*( (double) alleleCount)/((double) (alleleCount+bestOtherCount)); - weightHom += pTrue; - overallNonDiploid += ((double ) otherCount)/((double) count_sum*genotypes.size()); - } - // Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of - // prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from - // scratch, be my guest - but make sure it's done correctly! [EB] - } } - // make sure we had a het genotype + double refCountInHetSamples = 0.0; + double altCountInHetSamples = 0.0; + double correctCountInHomSamples = 0.0; + double incorrectCountInHomSamples = 0.0; + double nonDiploidCount = 0.0; + double totalReadCount = 0.0; + + for ( final Genotype genotype : vc.getGenotypes() ) { + if ( !vc.isSNP() ) { + continue; + } + + final int[] alleleCounts = getCounts(genotype, stratifiedContexts, vc); + + if (alleleCounts == null) continue; + + final long totalReads = MathUtils.sum(alleleCounts); + if ( genotype.isHet() ) { + // weight read counts by genotype quality so that e.g. mis-called homs don't affect the ratio too much + refCountInHetSamples += alleleCounts[0]; + altCountInHetSamples += alleleCounts[1]; + nonDiploidCount += totalReads - (alleleCounts[0] + alleleCounts[1]); + totalReadCount += totalReads; + } else if ( genotype.isHom() ) { + final int alleleIndex = genotype.isHomRef() ? 
0 : 1 ; + final int alleleCount = alleleCounts[alleleIndex]; + int bestOtherCount = 0; + for(int n = 0; n < alleleCounts.length; n++){ + if( n != alleleIndex && alleleCounts[n] > bestOtherCount ) { + bestOtherCount = alleleCounts[n]; + } + } + correctCountInHomSamples += alleleCount; + incorrectCountInHomSamples += bestOtherCount; + nonDiploidCount += totalReads - alleleCount; + totalReadCount += totalReads; + } + // Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of + // prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from + // scratch, be my guest - but make sure it's done correctly! [EB] + + } + final double diploidCountInHetSamples = altCountInHetSamples + refCountInHetSamples; + final double diploidCountInHomSamples = correctCountInHomSamples + incorrectCountInHomSamples; Map map = new HashMap<>(); - if ( weightHet > 0.0 ) { - map.put(GATKVCFConstants.ALLELE_BALANCE_HET_KEY,ratioHet/weightHet); + if ( diploidCountInHetSamples > 0.0 ) { + map.put(GATKVCFConstants.ALLELE_BALANCE_HET_KEY, refCountInHetSamples / diploidCountInHetSamples); } - if ( weightHom > 0.0 ) { - map.put(GATKVCFConstants.ALLELE_BALANCE_HOM_KEY,ratioHom/weightHom); + if ( diploidCountInHomSamples > 0.0 ) { + map.put(GATKVCFConstants.ALLELE_BALANCE_HOM_KEY, correctCountInHomSamples / diploidCountInHomSamples); } - if ( overallNonDiploid > 0.0 ) { - map.put(GATKVCFConstants.NON_DIPLOID_RATIO_KEY,overallNonDiploid); + if ( totalReadCount > 0.0 ) { + map.put(GATKVCFConstants.NON_DIPLOID_RATIO_KEY, nonDiploidCount / totalReadCount); } return map; } /** - * Provide a centralized method of getting the number of reads per allele, - * depending on the input given. 
Will use the following (in order of preference): + * Get the number of reads per allele, using the following (in order of preference): * - genotype.getAD() * - reads from an AlignmentContext * - reads from a PerReadAlleleLikelihoodMap (Not yet implemented) * - * * @param genotype The genotype of interest - * @param stratifiedContexts A mapping - * @param vc - * @return + * @param stratifiedContexts A mapping of sample name to read alignments at a location + * @param vc The Variant Context + * @return The number of reads per allele */ private int[] getCounts(final Genotype genotype, final Map stratifiedContexts, final VariantContext vc){ - - // Can't do anything without a genotype here if(genotype == null) return null; - int[] retVal = genotype.getAD(); - AlignmentContext context; + if (genotype.hasAD()) { + return genotype.getAD(); + } else { // If getAD() returned no information we count alleles from the pileup + final AlignmentContext context = stratifiedContexts == null ? null : stratifiedContexts.get(genotype.getSampleName()); + if (context == null) return null; - if ( retVal == null && stratifiedContexts != null && - (context = stratifiedContexts.get(genotype.getSampleName())) != null){ - // If we get to this point, the getAD() function returned no information - // about AlleleDepth by Sample - perhaps it wasn't annotated? - // In that case, let's try to build it up using the algorithm that - // was here in v 3.1-1 and earlier - // Also, b/c of the assignment check in the if statement above, - // we know we have a valid AlignmentContext for this sample! 
- - final ReadBackedPileup pileup = context.getBasePileup(); - final String bases = new String(pileup.getBases()); - List alleles = vc.getAlleles(); - final int n_allele = alleles.size(); - retVal = new int[n_allele]; - - // Calculate the depth for each allele, under the assumption that - // the allele is a single base - int i=0; - for(Allele a : alleles){ - retVal[i] = MathUtils.countOccurrences(a.toString().charAt(0), bases); - i++; + final byte[] bases = context.getBasePileup().getBases(); + // Should be able to replace with the following, but this annotation was not found when using -A AlleleBalance + // return vc.getAlleles().stream().map(a -> MathUtils.countOccurrences(a.getBases()[0], bases)).mapToInt(Integer::intValue).toArray(); + final List alleles = vc.getAlleles(); + final int[] result = new int[alleles.size()]; + // Calculate the depth for each allele, assuming that the allele is a single base + for(int n = 0; n < alleles.size(); n++){ + result[n] = MathUtils.countOccurrences(alleles.get(n).getBases()[0], bases); } + return result; } - - return retVal; - } @Override diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java index 24913bfe9..dd0caa865 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java @@ -32,6 +32,7 @@ import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineType; +import org.apache.commons.lang.mutable.MutableInt; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import 
org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; @@ -84,36 +85,23 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ - - - // We need a heterozygous genotype and either a context or alleleLikelihoodMap - if ( g == null || !g.isCalled() || !g.isHet() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) + // We need a heterozygous genotype and either a context or non-empty alleleLikelihoodMap + if ( g == null || !g.isCalled() || !g.isHet() || + ( stratifiedContext == null && (alleleLikelihoodMap == null || alleleLikelihoodMap.isEmpty())) ) return; - // Test for existence of allele, and manually check isSNP() - // and isBiallelic() while ignoring the allele - boolean biallelicSNP = vc.isSNP() && vc.isBiallelic(); - if(vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)){ - // If we have the GVCF allele, then the SNP is biallelic - // iff there are 3 alleles and both the reference and first alt - // allele are length 1. - biallelicSNP = vc.getAlleles().size() == 3 && - vc.getReference().length() == 1 && - vc.getAlternateAllele(0).length() == 1; - } + // If we have a allele the SNP is biallelic if there are 3 alleles and both the reference and first alt allele are length 1. + final boolean biallelicSNP = vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ? 
+ vc.getAlleles().size() == 3 && vc.getReference().length() == 1 && vc.getAlternateAllele(0).length() == 1 : + vc.isSNP() && vc.isBiallelic(); if ( !biallelicSNP ) return; - Double ratio; - if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) - ratio = annotateWithLikelihoods(alleleLikelihoodMap, vc); - else if ( stratifiedContext != null ) - ratio = annotateWithPileup(stratifiedContext, vc); - else - return; - + final Double ratio = (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) ? + annotateWithLikelihoods(alleleLikelihoodMap, vc) : + annotateWithPileup(stratifiedContext, vc); if (ratio == null) return; @@ -121,58 +109,42 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim } private Double annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc) { - - final HashMap alleleCounts = new HashMap<>(); + final HashMap alleleCounts = new HashMap<>(); for ( final Allele allele : vc.getAlleles() ) - alleleCounts.put(allele.getBases()[0], 0); + alleleCounts.put(allele.getBases()[0], new MutableInt(0)); - final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - for ( final PileupElement p : pileup ) { - if ( alleleCounts.containsKey(p.getBase()) ) - alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1); + for ( final byte base : stratifiedContext.getBasePileup().getBases() ) { + if ( alleleCounts.containsKey(base) ) + alleleCounts.get(base).increment(); } - // we need to add counts in the correct order - final int[] counts = new int[alleleCounts.size()]; - counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); - for (int i = 0; i < vc.getAlternateAlleles().size(); i++) - counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); - - // sanity check - if(counts[0] + counts[1] == 0) - return null; - - return ((double) counts[0] / (double)(counts[0] + counts[1])); + final int refCount = 
alleleCounts.get(vc.getReference().getBases()[0]).intValue(); + final int altCount = alleleCounts.get(vc.getAlternateAllele(0).getBases()[0]).intValue(); + return (refCount + altCount == 0) ? null : ((double) refCount) / (refCount + altCount); } private Double annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc) { final Set alleles = new HashSet<>(vc.getAlleles()); // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext - if ( ! perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) + if (!perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles)) throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet()); - final HashMap alleleCounts = new HashMap<>(); - for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } - - for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); - if (! 
a.isInformative() ) continue; // read is non-informative - final int prevCount = alleleCounts.get(a.getMostLikelyAllele()); - alleleCounts.put(a.getMostLikelyAllele(), prevCount + 1); + final HashMap alleleCounts = new HashMap<>(); + for (final Allele allele : vc.getAlleles()) { + alleleCounts.put(allele, new MutableInt(0)); } - final int[] counts = new int[alleleCounts.size()]; - counts[0] = alleleCounts.get(vc.getReference()); - for (int i = 0; i < vc.getAlternateAlleles().size(); i++) - counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) ); - - // sanity check - if(counts[0] + counts[1] == 0) - return null; - - return ((double) counts[0] / (double)(counts[0] + counts[1])); + for (final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); + if (a.isInformative()) { + alleleCounts.get(a.getMostLikelyAllele()).increment(); + } + } + final int refCount = alleleCounts.get(vc.getReference()).intValue(); + final int altCount = alleleCounts.get(vc.getAlternateAllele(0)).intValue(); + return (refCount + altCount == 0) ? 
null : ((double) refCount) / (refCount + altCount); } public List getKeyNames() { return Arrays.asList(GATKVCFConstants.ALLELE_BALANCE_KEY); } From 22d94a70962ea5c4fe87a4579ced9501c77807bb Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Mon, 14 Nov 2016 12:03:54 -0500 Subject: [PATCH 61/68] Fix adapter bounday for positive strand --- .../walkers/bqsr/BQSRIntegrationTest.java | 14 ++-- .../ErrorRatePerCycleIntegrationTest.java | 2 +- ...perGeneralPloidySuite1IntegrationTest.java | 6 +- .../UnifiedGenotyperIntegrationTest.java | 26 +++--- ...GenotyperNormalCallingIntegrationTest.java | 6 +- ...lexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerGVCFIntegrationTest.java | 44 +++++----- .../HaplotypeCallerIntegrationTest.java | 26 +++--- ...aplotypeCallerParallelIntegrationTest.java | 2 +- .../coverage/CallableLociIntegrationTest.java | 8 +- .../DepthOfCoverageIntegrationTest.java | 26 +++--- .../qc/PileupWalkerIntegrationTest.java | 6 +- .../gatk/utils/sam/ReadUtils.java | 2 +- .../fragments/FragmentUtilsUnitTest.java | 83 ++++++++++--------- .../LocusIteratorByStateUnitTest.java | 4 +- .../gatk/utils/sam/ReadUtilsUnitTest.java | 4 +- 16 files changed, 131 insertions(+), 130 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java index 4a148698b..8d35c0135 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java @@ -109,13 +109,13 @@ public class BQSRIntegrationTest extends WalkerTest { @DataProvider(name = "BQSRTest") public Object[][] createBQSRTestData() { return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", 
"fc9df1faf67bab70d32f89bcf4fa39db")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "73ec38eb23b1739ecef8194cbb1132a3")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "2d5721193ed4410d1a7d8db467a1fa05")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "16df7f1745f17f190c9fc33c475b91d8")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "01811003ae811ee74c4b8d3eb5e992fe")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "5e0eea6b0b300fbd2edabc3506ad3a60")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "8500b9747c16cb8eb17082163bdb8069")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "dde4269a873c6f7a751e775cbc79fdb9")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "fae427cb969638060e2294540e120dfc")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "651be7dcd798c71ceaefb773ed792193")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f5fd024e900d0d77c681483da1e5dfd5")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "751189ec8cd406628cf4e698c69e8d11")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "b921c36eb7f5be8f8b91b651247a83d7")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "6c3c6176eb6214dc7ef121fa11916e5d")}, {new BQSRTest(b36KGReference, SimpleCigarMatchMismatchBam, SimpleCigarMatchMismatchInterval, "", "56dfb2918a4cdae3ef9d705a43e85194")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")}, {new 
BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")}, diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java index 82b842604..f619036e6 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java @@ -62,7 +62,7 @@ public class ErrorRatePerCycleIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ErrorRatePerCycle -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-10,100,000 -o %s", 1, - Arrays.asList("6191340f0b56ee81fb248c8f5c913a8e")); + Arrays.asList("a83453820b7afb5ee79856093d62901f")); executeTest("ErrorRatePerCycle:", spec); } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 8e9027de7..9382cbe2a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -69,17 +69,17 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testSNP_ACS_Pools() { - 
executor.PC_LSV_Test_short("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "853b47780322b8133577aea528b9fd77"); + executor.PC_LSV_Test_short("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "90ed6f1c268b9c57ecb52b35a88b9368"); } @Test(enabled = true) public void testBOTH_GGA_Pools() { - executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "ee5a2f8954f38d6e5d44fe50b22e43a1"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "5ad4dd6b0c3c170ba44fdad6d4fa58cf"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "5cb3fe396302f3d4a4a9b7b3cc1877cc"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "d26b0ba07e056b73fe4cfe873636d0d6"); } @Test(enabled = true) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 18f091d4c..6c7c633b3 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("52a3064863b97e43d8df878edc29275c")); + Arrays.asList("d0499af17dc66e77849e547bc5a182ff")); executeTest("test min_base_quality_score 26", spec); } @@ -102,7 +102,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("2f2d7dd623446fc3cae62a44a016c16d")); + Arrays.asList("5f69de274c0705cf1cb9387651df98bf")); executeTest("test NDA", spec); } @@ -124,17 +124,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "3c0e109190cfbe41d24e7726cc8fe6e3"); + testOutputParameters("-sites_only", "4355f5b6fd8cd769a479677f1255bee5"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "06386b0a4495583aa924e5addd56c5dc"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "182af9490667cb6ce1415305de4f3fdd"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "b8b21ad6a2ff1f908e8e0073b57ba0e0"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "524e85c225ce330fd094de93f078fa56"); } private void testOutputParameters(final String args, final String md5) { @@ -148,7 +148,7 @@ public class 
UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("5c7d237e666439edb0ef8c697e37933c")); + Arrays.asList("c794c7681856c1ec3c3429dbd9e5dc75")); executeTest("test confidence 1", spec1); } @@ -156,7 +156,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNoPrior() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1, - Arrays.asList("24b550bbc3c9f0577e069b3fd3122d52")); + Arrays.asList("39d15f041a0c86058f46f23960bb129b")); executeTest("test no prior 1", spec1); } @@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testUserPrior() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1, - Arrays.asList("f60b6705daec1059ce3e533bf8e44c89")); + Arrays.asList("00bff7a5dc584b5b6931a826eae6b013")); executeTest("test user prior 1", spec1); } @@ -174,7 +174,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void emitPLsAtAllSites() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1, - Arrays.asList("ae778a64323abe0da5194f0b936f48aa")); + Arrays.asList("afcb9c4fd4a0e9ba4694d911bc75a7b2")); // GDA: TODO: BCF encoder/decoder doesn't seem to support 
non-standard values in genotype fields. IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail spec1.disableShadowBCF(); @@ -190,12 +190,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "7bbba110f720fc8c115fe2d53b34d693" ); + testHeterozosity( 0.01, "8abbec54f4bf82d7c48bf40b43fdaa91" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "767e8eacd216ac7437456e690287cecf" ); + testHeterozosity( 1.0 / 1850, "4221b00b0d10c005fb69fd2a298e384c" ); } private void testHeterozosity(final double arg, final String md5) { @@ -274,7 +274,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("6464138c0bd2ab2cac06773f19e37a4c")); + Arrays.asList("482f6b310e59d05508811932ec21c801")); executeTest(String.format("test multiple technologies"), spec); } @@ -293,7 +293,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("8885492069487efcf67e13608e762acd")); + Arrays.asList("c99b7dbe881aa3274cb9876e495cf8f3")); executeTest(String.format("test calling with BAQ"), spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 0ec669c01..11e910752 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -94,7 +94,7 @@ public class 
UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("e5c34be242c9b6bec687c7384ef83cb2")); + Arrays.asList("5919639094d775cdd6b6965e1210753a")); executeTest("test SingleSample Pilot2", spec); } @@ -102,7 +102,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("fca6cacfb523114a3fb93772569deb08")); + Arrays.asList("3304a20af6745beeec07ef2c47d617d3")); executeTest("test Multiple SNP alleles", spec); } @@ -126,7 +126,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("c41ff9e1e3cfb6bd45d772787dd8e2d3")); + Arrays.asList("5dc0ccd66105e0f12c72987d56c85235")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 
39d2cf8c8..755e4af49 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "79567a4e4307495e880e9782b3a88f7d"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "4f30d9c9f1eb4529071b7060e497235d"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 1b9f72c7a..334e38907 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -87,11 +87,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { //TODO this might need to be addressed at some point. 
//TODO the following test is commented out for the record //tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "a4286ada7f9efaa83f7a8f0e72c3cb45"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "cf5545094ebb264fa8eb879fd848d9ef"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "0086cc735cf792a9f236ec057c73b750"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "c87ff5438d4a6559b33970f1294f77c6"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "2cc9f789100e138ffc0c383b12a1322a"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "44cc8f78e28d905efc30c218d821cc7c"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2c67bdc08c8784f2114c2039270b9766"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "39b2ad53ffdfcbaa4af3454c321daaa7"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "861fa31b135d200f765914126b422cf4"}); return tests.toArray(new Object[][]{}); @@ -106,11 +106,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ae2c7e570855f6d6ca58ddd1089a970"}); - tests.add(new Object[]{NA12878_PCRFREE, 
ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "1961007bd98a174a4a1b3e76a9c2f156"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "b34b0b61583628fbd51221627adcdb81"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "cbf988eca3f368ef5b17108595cd8c5e"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "d146c8dc4fc0605b3776ab5fec837d53"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "c317193f0d1c9a8168f2625c8bf1dd2b"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "c6c19ff9dc229f6af6080a175267344c"}); final String NA12878bandedResolutionMD5 = "7240907ec3dc2ed49b55c9956546ba13"; tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5}); @@ -129,11 +129,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "654c8264cfcbcb71da479761912fbd71"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "4959f20a8bd3327760d94ccc40157f81"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "85749762dac9684f4c8f9da18a210109"}); + 
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "6c727b804084a2324ecd1c98b72734b9"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "190cef14684c95ba290d7a5fa13fdc07"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "50e628de2a79cd6887af020b713ca3b8"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "21c87a3edafee3cb080169963e1e2623"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "e48bbcf453e63a6ea5eeda05f6865f94"}); return tests.toArray(new Object[][]{}); @@ -147,11 +147,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "dd9fdcae44ab316c04650bf50c38e4b2"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "cb318100ae15cb3dcc342b6142ac6361"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "532188686870c7edd2ea3352ea93f66a"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "48521b89cecceb9846e4dfc0dd415874"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "eaacbeaff99a37ffa07e1f11e7f1deb2"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"}); - 
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "8a094080fb25bbcd39325dcdd62bcf65"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "868a097a8a108f5159dbbabbfdb2e38b"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "685025831ac783784d7838e568e35f46"}); return tests.toArray(new Object[][]{}); @@ -294,7 +294,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testNoCallGVCFMissingPLsBugFix() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("66f242cf3f1f1776c743505b84505f94")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("883fdc6c10fd7cbc1de375ed26ce5734")); spec.disableShadowBCF(); executeTest("testNoCallGVCFMissingPLsBugFix", spec); } @@ -363,7 +363,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testHaplotypeCallerMultiAllelicNonRef() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A StrandAlleleCountsBySample", b37KGReference, privateTestDir + "multiallelic-nonref.bam", "2:47641259-47641859", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("19fc2c5218d907fcdcd36de2afbef19c")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 
Collections.singletonList("7c707c66f77482e3f6b2b014b152bbf4")); spec.disableShadowBCF(); executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec); } @@ -372,7 +372,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testHaplotypeCallerMaxNumPLValues() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("b2adc744d9dff2f488149bcc96d6bb6d")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("dd0bfade4f0a9f23a500fd23c3a24a29")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValues", spec); } @@ -389,7 +389,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbd37b492f77c50d2da744d5e00c6f90")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5ea45f32c09e9f7a4132f98d642f260b")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel", spec); // Make sure the "Maximum allowed number 
of PLs exceeded" messages are in the log @@ -414,7 +414,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s", b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cbd37b492f77c50d2da744d5e00c6f90")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5ea45f32c09e9f7a4132f98d642f260b")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel", spec); // Make sure the "Maximum allowed number of PLs exceeded" messages are in the log @@ -458,7 +458,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testHaplotypeCallerGVCSpanDel() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:26357667 -ERC GVCF --no_cmdline_in_header -A AS_ReadPosRankSumTest -A ReadPosRankSumTest -variant_index_type %s -variant_index_parameter %d", b37KGReference, privateTestDir + "NexPond-377866-1:26357600-26357700.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("b8f0bb74bc099a8f78d600d88861e1b6")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("bb12cf2dfa6f1fa0692395e295792584")); spec.disableShadowBCF(); executeTest("testHaplotypeCallerGVCSpanDel", spec); } diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index aadd03a32..3f80b779b 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -107,7 +107,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeBAMOutFlags() throws IOException { - HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "700c5d20e9d9d9a431fcda9bff91f72e", "a0daf5a80158d4a462248415c1e17565"); + HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "2e4cd93b4cad12259728d19a41d2a6ff", "9d6bd79cdae3e3222fa93f542fbca153"); } @Test @@ -118,7 +118,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSample() throws IOException { - HCTest(NA12878_BAM, "", "c04293cb8466a1a217bce4ef419bdabe"); + HCTest(NA12878_BAM, "", "9f17ce83e639a1bd9b3f2d9fa33b15b2"); } @Test @@ -133,12 +133,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleTetraploid() throws IOException { - HCTest(NA12878_BAM, "-ploidy 4", "5098645e8b570bc4521570654fa91806"); + HCTest(NA12878_BAM, "-ploidy 4", "f993db900080aeb48c43982745e1084d"); } @Test public void testHaplotypeCallerMinBaseQuality() throws IOException { - HCTest(NA12878_BAM, "-mbq 15", "c04293cb8466a1a217bce4ef419bdabe"); + HCTest(NA12878_BAM, "-mbq 15", "9f17ce83e639a1bd9b3f2d9fa33b15b2"); } @Test @@ -148,12 +148,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMinBaseQualityTetraploid() 
throws IOException { - HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "5098645e8b570bc4521570654fa91806"); + HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "f993db900080aeb48c43982745e1084d"); } @Test public void testHaplotypeCallerGraphBasedSingleSample() throws IOException { - HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "8ab21bd6fb7ef37480f556fd5fa5375c"); + HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "420954190aef671edd02bd3c73e22642"); } @Test @@ -168,7 +168,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleWithDbsnp() throws IOException { - HCTest(NA12878_BAM, "-D " + b37dbSNP132, "ff8e142f491b06e17e64e3a5d59737a7"); + HCTest(NA12878_BAM, "-D " + b37dbSNP132, "9e8513ed4065138bee8dd9363a9fd355"); } @Test @@ -312,7 +312,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("b56895e6d28ea0b9dadeecd0ff61687e")); + Arrays.asList("fc71471b01f93bc531e3cf19cdf78b1f")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -321,7 +321,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,100,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("7b52164df8bf76d789836f990bd6066a")); + 
Arrays.asList("bf8bb5d13b01facdf90ec24bfbf82faa")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } @@ -329,7 +329,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGSGraphBased() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("096826325215f79fe70661d984ae45a4")); + Arrays.asList("ec9a1fb56882c21f3e4793e5f71f4e9e")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -338,7 +338,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("ff3b24412090ce7693d66d750ae84ac9")); + Arrays.asList("2ffaf2e9ef293a6d5ce7c00be40edba7")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } @@ -430,7 +430,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { // but please make sure that both outputs get the same variant, // alleles all with DBSNP ids // We test here that change in active region size does not have an effect in placement of indels. 
- final String md5 = "87b687b5476eb38b11db6a156b4066c8"; + final String md5 = "66caceac0a54cdfd847bfdf4226bb36a"; final WalkerTestSpec shortSpec = new WalkerTestSpec(commandLineShortInterval + " -o %s",Arrays.asList(md5)); executeTest("testDifferentIndelLocationsDueToSWExactDoubleComparisonsFix::shortInterval",shortSpec); final WalkerTestSpec longSpec = new WalkerTestSpec(commandLineLongInterval + " -o %s",Arrays.asList(md5)); @@ -484,7 +484,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerTandemRepeatAnnotator() throws IOException{ - HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "2cf4cab0035d09aa0aec6f3faa2c9df6"); + HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "408c6940a090d31c11c171ed5e0e033c"); } @Test diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index fdc51094c..6cc7f63a8 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -69,7 +69,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { List tests = new ArrayList<>(); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "07f969acede5e0ad7e1e94f4383af2a9"}); + tests.add(new Object[]{nct, "da195c6c4c8e765acb35f08e37132108"}); } return tests.toArray(new Object[][]{}); diff --git 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLociIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLociIntegrationTest.java index 1889f6f4d..06d74a636 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLociIntegrationTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLociIntegrationTest.java @@ -33,13 +33,13 @@ import java.util.Arrays; public class CallableLociIntegrationTest extends WalkerTest { final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; - final static String SUMMARY_MD5 = "a6f5963669f19d9d137ced87d65834b0"; + final static String SUMMARY_MD5 = "27aea9e76af554db5e886dacb0d3edd0"; @Test public void testCallableLociWalkerBed() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("9b4ffea1dbcfefadeb1c9fa74b0e0e59", SUMMARY_MD5)); + Arrays.asList("908a22c0bfe81e6f4c571de290e48d03", SUMMARY_MD5)); executeTest("formatBed", spec); } @@ -47,7 +47,7 @@ public class CallableLociIntegrationTest extends WalkerTest { public void testCallableLociWalkerPerBase() { String gatk_args = commonArgs + " -format STATE_PER_BASE -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("d6505e489899e80c08a7168777f6e07b", SUMMARY_MD5)); + Arrays.asList("11e7fdd39bb83eec44ca452faf0dc825", SUMMARY_MD5)); executeTest("format_state_per_base", spec); } @@ -63,7 +63,7 @@ public class CallableLociIntegrationTest extends WalkerTest { public void testCallableLociWalker3() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 
--minMappingQuality 20 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("7f79ad8195c4161060463eeb21d2bb11", "7ee269e5f4581a924529a356cc806e55")); + Arrays.asList("5e5ee8850815810680f205a520d05a0f", "441b7c31bed5c30705f9565b708ecace")); executeTest("formatBed lots of arguments", spec); } } diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverageIntegrationTest.java index d064328c0..9515978cf 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -84,25 +84,25 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.setOutputFileLocation(baseOutputFile); // now add the expected files that get generated - spec.addAuxFile("0f9603eb1ca4a26828e82d8c8f4991f6", baseOutputFile); - spec.addAuxFile("51e6c09a307654f43811af35238fb179", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); + spec.addAuxFile("423571e4c05e7934322172654ac6dbb7", baseOutputFile); + spec.addAuxFile("9df5e7e07efeb34926c94a724714c219", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); spec.addAuxFile("3bf1d7247ea68d1afb35c2032c68dbdf", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); spec.addAuxFile("9cd395f47b329b9dd00ad024fcac9929", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); - spec.addAuxFile("681dcbedcf7ca14bb44134abd1d8da3f", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); - spec.addAuxFile("0f58e7f0909b84897fea5daebd9d2948", 
createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); - spec.addAuxFile("2832e48c12b1d8811ccd319a8ffb8dc1", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); - spec.addAuxFile("a836b92ac17b8ff9788e2aaa9116b5d4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); + spec.addAuxFile("31f9aa7ddad7809bc2f6675014dd4eaf", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("958667ce6f92106a32a77098c1533d4f", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); + spec.addAuxFile("5e87db30702e44031920f7417cfac844", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); + spec.addAuxFile("b82846df660f0aac8429aec57c2a62d6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); spec.addAuxFile("22fee04ffca7fa489ae2e66309de12e4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); - spec.addAuxFile("2de22ad840bf40621e51316728a32fe9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); - spec.addAuxFile("6c84a067016c63e8c2face2de800acc7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); - spec.addAuxFile("2629883b99428f51e2d90d820b45504a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); - spec.addAuxFile("6909d50a7da337cd294828b32b945eb8", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); + spec.addAuxFile("e0f0d44cb4bc0557ee55b8b2aaa0cb9f", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); + spec.addAuxFile("d69d08fd37f63941eea52d7b66a625df", 
createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); + spec.addAuxFile("de1795f42397d1282c012a3b9c396b5c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); + spec.addAuxFile("d53431022f7387fe9ac47814ab1fcd88", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); spec.addAuxFile("f6e52c5659f53afdcfc69c25272b9e99", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); spec.addAuxFile("df0ba76e0e6082c0d29fcfd68efc6b77", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("c2fb3a31d02df03c35afbe7f2284ad66", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("d1b3d08c6e9565a1dab727d089085761", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("c0fadfcfd88cec81ba0d57b33bdec277", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + spec.addAuxFile("a72253b8e06dc6c653355b20e56e2f6e", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("23ca7c647e3e53eb9b713d46b92bf6a2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("cd4ef2d43dc98f1bd7e6db532c60b3d1", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); execute("testBaseOutputNoFiltering",spec); } diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/qc/PileupWalkerIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/qc/PileupWalkerIntegrationTest.java index a52059bb5..db75bf6f6 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/qc/PileupWalkerIntegrationTest.java +++ 
b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/qc/PileupWalkerIntegrationTest.java @@ -77,10 +77,10 @@ public class PileupWalkerIntegrationTest extends WalkerTest { @DataProvider(name="GATKBAMIndexTest") public Object[][] makeMyDataProvider() { List tests = new ArrayList(); - tests.add(new Object[]{"-L 20:1-76,050","8702701350de11a6d28204acefdc4775"}); + tests.add(new Object[]{"-L 20:1-76,050","f275a2de053f44db19bc6591fe1c8dae"}); tests.add(new Object[]{"-L 20:10,000,000-10,001,100","818cf5a8229efe6f89fc1cd8145ccbe3"}); - tests.add(new Object[]{"-L 20:62,954,114-63,025,520","22471ea4a12e5139aef62bf8ff2a5b63"}); - tests.add(new Object[]{"-L 20:1-76,050 -L 20:20,000,000-20,000,100 -L 20:40,000,000-40,000,100 -L 20:30,000,000-30,000,100 -L 20:50,000,000-50,000,100 -L 20:62,954,114-63,025,520 ","08d899ed7c5a76ef3947bf67338acda1"}); + tests.add(new Object[]{"-L 20:62,954,114-63,025,520","7e3462745d298de3b7493b42d3603392"}); + tests.add(new Object[]{"-L 20:1-76,050 -L 20:20,000,000-20,000,100 -L 20:40,000,000-40,000,100 -L 20:30,000,000-30,000,100 -L 20:50,000,000-50,000,100 -L 20:62,954,114-63,025,520 ","977298255b04d66c17bb5466da8251f7"}); return tests.toArray(new Object[][]{}); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java index afa486cac..72254c884 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java @@ -215,7 +215,7 @@ public class ReadUtils { return read.getMateAlignmentStart() - 1; // case 1 (see header) } else { final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) - return read.getAlignmentStart() + insertSize + 1; // case 2 (see header) + return 
read.getAlignmentStart() + insertSize; // case 2 (see header) } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsUnitTest.java index 1791f2274..d6bbb927e 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsUnitTest.java @@ -54,8 +54,8 @@ public class FragmentUtilsUnitTest extends BaseTest { private final static boolean DEBUG = false; private class FragmentUtilsTest extends TestDataProvider { - List statesForPileup = new ArrayList(); - List statesForReads = new ArrayList(); + List statesForPileup = new ArrayList<>(); + List statesForReads = new ArrayList<>(); private FragmentUtilsTest(String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { @@ -70,8 +70,8 @@ public class FragmentUtilsUnitTest extends BaseTest { boolean posCoveredByRight = pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd(); if ( posCoveredByLeft || posCoveredByRight ) { - List reads = new ArrayList(); - List offsets = new ArrayList(); + List reads = new ArrayList<>(); + List offsets = new ArrayList<>(); if ( posCoveredByLeft ) { reads.add(left); @@ -174,7 +174,7 @@ public class FragmentUtilsUnitTest extends BaseTest { @DataProvider(name = "MergeFragmentsTest") public Object[][] createMergeFragmentsTest() throws Exception { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final String leftFlank = "CCC"; final String rightFlank = "AAA"; @@ -248,29 +248,29 @@ public class FragmentUtilsUnitTest extends BaseTest { final byte[] commonQuals = Utils.dupBytes((byte)30, common.length()); final String adapter = "NNNN"; - final GATKSAMRecord read1 = makeOverlappingRead(adapter, 30, common, commonQuals, "", 30, 10); - final GATKSAMRecord 
read2 = makeOverlappingRead("", 30, common, commonQuals, adapter, 30, 10); + final GATKSAMRecord readLeftAdapter = makeOverlappingRead(adapter, 30, common, commonQuals, "", 30, 10); + final GATKSAMRecord readRightAdapter = makeOverlappingRead("", 30, common, commonQuals, adapter, 30, 10); final GATKSAMRecord expectedMerged = makeOverlappingRead("", 30, common, commonQuals, "", 30, 10); - read1.setCigarString("4S" + common.length() + "M"); - read1.setProperPairFlag(true); - read1.setReadPairedFlag(true); - read1.setFirstOfPairFlag(true); - read1.setReadNegativeStrandFlag(true); - read1.setMateNegativeStrandFlag(false); - read1.setMateAlignmentStart(read2.getAlignmentStart()); - read2.setCigarString(common.length() + "M4S"); - read2.setProperPairFlag(true); - read2.setReadPairedFlag(true); - read2.setFirstOfPairFlag(false); - read2.setReadNegativeStrandFlag(false); - read2.setMateNegativeStrandFlag(true); - read2.setMateAlignmentStart(read1.getAlignmentStart()); + readLeftAdapter.setCigarString(adapter.length() + "S" + common.length() + "M"); + readLeftAdapter.setProperPairFlag(true); + readLeftAdapter.setReadPairedFlag(true); + readLeftAdapter.setFirstOfPairFlag(true); + readLeftAdapter.setReadNegativeStrandFlag(true); + readLeftAdapter.setMateNegativeStrandFlag(false); + readLeftAdapter.setMateAlignmentStart(readRightAdapter.getAlignmentStart()); + readRightAdapter.setCigarString(common.length() + "M4S"); + readRightAdapter.setProperPairFlag(true); + readRightAdapter.setReadPairedFlag(true); + readRightAdapter.setFirstOfPairFlag(false); + readRightAdapter.setReadNegativeStrandFlag(false); + readRightAdapter.setMateNegativeStrandFlag(true); + readRightAdapter.setMateAlignmentStart(readLeftAdapter.getAlignmentStart()); - final int insertSize = common.length() - 1; - read1.setInferredInsertSize(-insertSize); - read2.setInferredInsertSize(insertSize); + final int insertSize = common.length(); + readLeftAdapter.setInferredInsertSize(-insertSize); + 
readRightAdapter.setInferredInsertSize(insertSize); - final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(readLeftAdapter, readRightAdapter); Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString()); Assert.assertEquals(actual.getReadBases(), expectedMerged.getReadBases()); Assert.assertEquals(actual.getReadGroup(), expectedMerged.getReadGroup()); @@ -282,24 +282,25 @@ public class FragmentUtilsUnitTest extends BaseTest { @Test(enabled = true) public void testHardClippingBeforeMergeResultingInCompletelyContainedSecondRead() { final String adapter = "NNNN"; + final int minReadSize = 7; - final GATKSAMRecord read1 = makeOverlappingRead(adapter, 30, Utils.dupString("A", 10), Utils.dupBytes((byte)30, 10), "", 30, 10); - final GATKSAMRecord read2 = makeOverlappingRead("", 30, Utils.dupString("A", 7), Utils.dupBytes((byte)30, 7), adapter, 30, 10); - read1.setCigarString("4S10M"); - read1.setProperPairFlag(true); - read1.setFirstOfPairFlag(true); - read1.setReadNegativeStrandFlag(true); - read1.setMateAlignmentStart(10); - read2.setCigarString("7M4S"); - read2.setProperPairFlag(true); - read2.setFirstOfPairFlag(false); - read2.setReadNegativeStrandFlag(false); + final GATKSAMRecord readLeftAdapter = makeOverlappingRead(adapter, 30, Utils.dupString("A", 10), Utils.dupBytes((byte)30, 10), "", 30, 10); + final GATKSAMRecord readRightAdapter = makeOverlappingRead("", 30, Utils.dupString("A", minReadSize), Utils.dupBytes((byte)30, minReadSize), adapter, 30, 10); + readLeftAdapter.setCigarString(adapter.length() + "S10M"); + readLeftAdapter.setProperPairFlag(true); + readLeftAdapter.setFirstOfPairFlag(true); + readLeftAdapter.setReadNegativeStrandFlag(true); + readLeftAdapter.setMateAlignmentStart(10); + readRightAdapter.setCigarString(minReadSize + "M4S"); + readRightAdapter.setProperPairFlag(true); + 
readRightAdapter.setFirstOfPairFlag(false); + readRightAdapter.setReadNegativeStrandFlag(false); - final int insertSize = 7 - 1; - read1.setInferredInsertSize(insertSize); - read2.setInferredInsertSize(-insertSize); + final int insertSize = minReadSize; + readLeftAdapter.setInferredInsertSize(insertSize); + readRightAdapter.setInferredInsertSize(-insertSize); - final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(readLeftAdapter, readRightAdapter); Assert.assertNull(actual); } @@ -356,7 +357,7 @@ public class FragmentUtilsUnitTest extends BaseTest { @DataProvider(name = "AdjustFragmentsTest") public Object[][] createAdjustFragmentsTest() throws Exception { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final String leftFlank = "CCC"; final String rightFlank = "AAA"; diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java index 3fecd369c..70d1308aa 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -710,7 +710,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { read.setReadNegativeStrandFlag(false); read.setMateNegativeStrandFlag(true); read.setMateAlignmentStart(start - 1); - read.setInferredInsertSize(goodBases - 1); + read.setInferredInsertSize(goodBases); tests.add(new Object[]{0, goodBases, nClips, read}); } } @@ -738,6 +738,6 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { } final int nExpectedPileups = nReadContainingPileups; - Assert.assertEquals(nPileups, nExpectedPileups, "Wrong number of 
pileups seen"); + Assert.assertEquals(nPileups, nExpectedPileups, "Wrong number of pileups seen for " + read + " with " + nClipsOnLeft + " clipped bases."); } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java index 6df455d18..b0677cd8f 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java @@ -90,7 +90,7 @@ public class ReadUtilsUnitTest extends BaseTest { read.setReadNegativeStrandFlag(false); read.setMateNegativeStrandFlag(true); boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, myStart + fragmentSize + 1); + Assert.assertEquals(boundary, myStart + fragmentSize); // Test case 2: positive strand, second read read = makeRead(fragmentSize, mateStart); @@ -99,7 +99,7 @@ public class ReadUtilsUnitTest extends BaseTest { read.setReadNegativeStrandFlag(false); read.setMateNegativeStrandFlag(true); boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, myStart + fragmentSize + 1); + Assert.assertEquals(boundary, myStart + fragmentSize); // Test case 3: negative strand, second read read = makeRead(fragmentSize, mateStart); From 262c08aee3dd1d11063da45792d85d051c85644b Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Wed, 23 Nov 2016 01:06:13 -0500 Subject: [PATCH 62/68] Make exit system file type message generic --- .../gatk/utils/commandline/CommandLineProgram.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java index 04ed1c135..8bb1c3f29 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java 
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java @@ -457,9 +457,9 @@ public abstract class CommandLineProgram { throw new ReviewedGATKException("SamException found with no message!", t); errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A BAM/CRAM ERROR has occurred (version %s): %n", getVersionNumber()); + errorPrintf("AN INPUT FILE ERROR has occurred (version %s): %n", getVersionNumber()); errorPrintf("%n"); - errorPrintf("This means that there is something wrong with the BAM/CRAM file(s) you provided.%n"); + errorPrintf("This means that there is something wrong with the input file(s) you provided.%n"); errorPrintf("The error message below tells you what is the problem.%n"); errorPrintf("%n"); printDocumentationReference(); From 68b6f8f6384c93a309947af4e6ead2709e910c6b Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Mon, 28 Nov 2016 21:01:47 -0500 Subject: [PATCH 63/68] Replace VariantContextWriterFactory with VariantContextWriterBuilder --- .../haplotypecaller/HaplotypeResolver.java | 5 +-- .../walkers/phasing/ReadBackedPhasing.java | 4 +- .../storage/VariantContextWriterStorage.java | 38 ++++++++++++------- .../io/stubs/VariantContextWriterStub.java | 4 +- .../gatk/tools/CatVariants.java | 11 ++++-- .../LeftAlignAndTrimVariants.java | 4 +- .../variantutils/RandomlySplitVariants.java | 12 ++++-- .../VariantsToAllelicPrimitives.java | 4 +- .../walkers/variantutils/VariantsToVCF.java | 4 +- 9 files changed, 54 insertions(+), 32 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java index 3d32b1ea4..271b57961 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java @@ -77,8 +77,7 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.VariantContextUtils; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; - +import htsjdk.variant.variantcontext.writer.SortingVariantContextWriter; import java.util.*; /** @@ -186,7 +185,7 @@ public class HaplotypeResolver extends RodWalker { headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match")); final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.emptySet()); baseWriter.writeHeader(vcfHeader); - writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW); + writer = new SortingVariantContextWriter(baseWriter, ACTIVE_WINDOW); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java index 19928c3fd..767d3a103 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java @@ -81,7 +81,7 @@ import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; 
-import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.variantcontext.writer.SortingVariantContextWriter; import java.io.*; import java.util.*; @@ -257,7 +257,7 @@ public class ReadBackedPhasing extends RodWalker hInfo = new HashSet(); diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java index b3f76fe2f..7187e78c5 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java @@ -38,7 +38,7 @@ import htsjdk.variant.bcf2.BCF2Utils; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import htsjdk.variant.vcf.VCFHeader; import java.io.*; @@ -82,8 +82,11 @@ public class VariantContextWriterStorage implements Storage options = stub.getWriterOptions(indexOnTheFly); - VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); + final VariantContextWriterBuilder.OutputType fileOutputType = allowCompressed && stub.isCompressed() ? 
+ VariantContextWriterBuilder.OutputType.BLOCK_COMPRESSED_VCF : VariantContextWriterBuilder.OutputType.VCF; + final EnumSet options = stub.getWriterOptions(indexOnTheFly); + + VariantContextWriter writer = new VariantContextWriterBuilder() + .setOutputFile(file) + .setOutputFileType(fileOutputType) + .setReferenceDictionary(stub.getMasterSequenceDictionary()) + .setIndexCreator(stub.getIndexCreator()) + .setOptions(options) + .build(); // if the stub says to test BCF, create a secondary writer to BCF and an 2 way out writer to send to both // TODO -- remove me when argument generateShadowBCF is removed - if ( stub.alsoWriteBCFForTest() && ! VariantContextWriterFactory.isBCFOutput(file, options)) { + if ( stub.alsoWriteBCFForTest() && + ! (options.contains(Options.FORCE_BCF) || file != null && file.getName().contains(".bcf")) ) { final File bcfFile = BCF2Utils.shadowBCF(file); if ( bcfFile != null ) { - FileOutputStream bcfStream; - try { - bcfStream = new FileOutputStream(bcfFile); - } catch (FileNotFoundException e) { - throw new RuntimeException(bcfFile + ": Unable to create BCF writer", e); - } + final VariantContextWriter bcfWriter = new VariantContextWriterBuilder() + .setOutputFile(bcfFile) + .setOutputFileType(VariantContextWriterBuilder.OutputType.BCF) + .setReferenceDictionary(stub.getMasterSequenceDictionary()) + .setOptions(options) + .build(); - VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, bcfStream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); writer = new TestWriter(writer, bcfWriter); } } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java index 6cd63497a..276b5c309 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java +++ 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java @@ -34,7 +34,6 @@ import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; @@ -216,7 +215,8 @@ public class VariantContextWriterStub implements Stub, Var if ( indexOnTheFly) options.add(Options.INDEX_ON_THE_FLY); if ( writeFullFormatField ) options.add(Options.WRITE_FULL_FORMAT_FIELD); - if ( forceBCF || (getOutputFile() != null && VariantContextWriterFactory.isBCFOutput(getOutputFile())) ) + final File file = getOutputFile(); + if ( forceBCF || (file != null && options.contains(Options.FORCE_BCF) || file != null && file.getName().contains(".bcf")) ) options.add(Options.FORCE_BCF); return options.isEmpty() ? 
EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java index 65936795c..f361cfa33 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java @@ -51,7 +51,7 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextComparator; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import java.io.*; import java.util.*; @@ -265,11 +265,16 @@ public class CatVariants extends CommandLineProgram { } } - FileOutputStream outputStream = new FileOutputStream(outputFile); EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); IndexCreator idxCreator = GATKVCFUtils.makeIndexCreator(variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary()); - final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options); + final VariantContextWriter outputWriter = + new VariantContextWriterBuilder() + .setOutputFile(outputFile) + .setReferenceDictionary(ref.getSequenceDictionary()) + .setIndexCreator(idxCreator) + .setOptions(options) + .build(); boolean firstFile = true; int count = 0; while(!priorityQueue.isEmpty() ){ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java index c169a2844..7d24bdd5a 100644 
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -55,7 +55,7 @@ import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.variantcontext.writer.SortingVariantContextWriter; import java.util.*; @@ -194,7 +194,7 @@ public class LeftAlignAndTrimVariants extends RodWalker { baseWriter.writeHeader(new VCFHeader(headerLines, samples)); - writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, MAX_INDEL_LENGTH); + writer = new SortingVariantContextWriter(baseWriter, MAX_INDEL_LENGTH); referenceWindowStop = getToolkit().getArguments().reference_window_stop; } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java index ce737178a..5e50a3a97 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java @@ -44,7 +44,7 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import java.io.File; 
import java.util.*; @@ -134,14 +134,20 @@ public class RandomlySplitVariants extends RodWalker { if(splitToMany){ writers = new VariantContextWriter[numOfFiles]; for(int i = 0; i { baseWriter.writeHeader(new VCFHeader(headerLines, samples)); - vcfWriter = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200); + vcfWriter = new SortingVariantContextWriter(baseWriter, 200); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java index 3bf2c11b5..a2a55d69b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java @@ -52,7 +52,7 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.variantcontext.writer.SortingVariantContextWriter; import java.io.File; import java.util.*; @@ -116,7 +116,7 @@ public class VariantsToVCF extends RodWalker { VariantOverlapAnnotator variantOverlapAnnotator = null; public void initialize() { - vcfwriter = VariantContextWriterFactory.sortOnTheFly(baseWriter, 40, false); + vcfwriter = new SortingVariantContextWriter(baseWriter, 40); variantOverlapAnnotator = new VariantOverlapAnnotator(dbsnp.dbsnp, getToolkit().getGenomeLocParser()); } From b42dc1624b3b518dc404870df6feb8347c9e74f6 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Sat, 4 Jun 2016 02:14:49 -0400 Subject: [PATCH 64/68] Documentation fixes - 
Fix missing closing code block tag - SOR fix + minor fix to template - Minor typo fixes - Make no_cmdline_in_header arg visible in docs - Caveats for UG-only args that are difficult to move out of common collection - Reference -L and -XL in -gvcf documentation - Annotation doc fixes (dsde-docs#1289) - Unhide some ContEst args and make Advanced - Fix inputPrior error message - Update web urls - Add JEXL doc link --- ...GenotypeCalculationArgumentCollection.java | 96 ++-- .../annotator/AS_BaseQualityRankSumTest.java | 2 +- .../annotator/AS_InsertSizeRankSum.java | 16 +- .../annotator/AS_MQMateRankSumTest.java | 4 +- .../AS_MappingQualityRankSumTest.java | 3 +- .../walkers/annotator/AS_RankSumTest.java | 4 +- .../annotator/AS_ReadPosRankSumTest.java | 2 +- .../annotator/BaseQualityRankSumTest.java | 2 +- .../annotator/ClippingRankSumTest.java | 2 +- .../tools/walkers/annotator/FisherStrand.java | 2 +- .../walkers/annotator/InbreedingCoeff.java | 2 +- .../annotator/LikelihoodRankSumTest.java | 2 +- .../annotator/MappingQualityRankSumTest.java | 3 +- .../tools/walkers/annotator/RankSumTest.java | 3 +- .../walkers/annotator/ReadPosRankSumTest.java | 2 +- .../walkers/annotator/StrandOddsRatio.java | 10 +- .../walkers/cancer/contamination/ContEst.java | 14 +- .../gatk/tools/walkers/cancer/m2/MuTect2.java | 38 +- .../walkers/genotyper/GenotypingEngine.java | 2 +- .../StandardCallerArgumentCollection.java | 42 +- .../walkers/phasing/PhaseByTransmission.java | 2 +- .../walkers/phasing/ReadBackedPhasing.java | 2 +- .../walkers/rnaseq/SplitNCigarReads.java | 3 +- .../arguments/GATKArgumentCollection.java | 5 +- .../filters/NotPrimaryAlignmentFilter.java | 9 +- .../walkers/annotator/VariantAnnotator.java | 16 +- .../walkers/filters/VariantFiltration.java | 2 +- .../variantutils/ValidateVariants.java | 17 +- .../gatk/utils/help/HelpConstants.java | 6 +- settings/helpTemplates/generic.template.html | 538 +++++++++--------- 30 files changed, 428 insertions(+), 423 deletions(-) diff 
--git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index c74a3b751..ed639c951 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -66,39 +66,23 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public static final String MAX_ALTERNATE_ALLELES_SHORT_NAME = "maxAltAlleles"; /** - * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping. - * Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site. + * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles + * being sent on for genotyping. Using this argument instructs the genotyper to annotate (in the INFO field) the + * number of alternate alleles that were originally discovered (but not necessarily genotyped) at the site. */ - @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) + @Argument(fullName = "annotateNDA", shortName = "nda", doc = "Annotate number of alleles observed", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; /** - * Use the new allele frequency / QUAL score model + * This activates a model for calculating QUAL that was introduced in version 3.7 (November 2016). 
We expect this + * model will become the default in future versions. */ - @Argument(fullName = "useNewAFCalculator", shortName = "newQual", doc = "If provided, we will use the new AF model instead of the so-called exact model", required = false) + @Argument(fullName = "useNewAFCalculator", shortName = "newQual", doc = "Use new AF model instead of the so-called exact model", required = false) public boolean USE_NEW_AF_CALCULATOR = false; /** - * The expected heterozygosity value used to compute prior probability that a locus is non-reference. - * - * From the heterozygosity we calculate the probability of N samples being hom-ref at a site as 1 - sum_i_2N (hets / i) - * where hets is this case is analogous to the parameter theta from population genetics. See https://en.wikipedia.org/wiki/Coalescent_theory for more details. - * - * Note that heterozygosity as used here is the population genetics concept. (See http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics. - * We also suggest the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on the theory.) That is, a hets value of 0.001 - * implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp. - * - * The default priors provided for humans (hets = 1e-3) - * - * Also note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype, - * which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there - * may be a AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK - * only uses this posterior probability in determining the prob. that a site is polymorphic. 
So changing the - * het parameters only increases the chance that a site will be called non-reference across all samples, but - * doesn't actually change the output genotype likelihoods at all, as these aren't posterior probabilities at all. - * - * The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy, - * which determines how many chromosomes each individual in the species carries. + * The expected heterozygosity value used to compute prior probability that a locus is non-reference. See + * https://software.broadinstitute.org/gatk/documentation/article?id=8603 for more details. */ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY; @@ -110,8 +94,8 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY; /** - * The standard deviation of the distribution of alt allele fractions. The above heterozygosity parameters give the - * *mean* of this distribution; this parameter gives its spread. + * The standard deviation of the distribution of alt allele fractions. The above heterozygosity parameters give + * the *mean* of this distribution; this parameter gives its spread. */ @Argument(fullName = "heterozygosity_stdev", shortName = "heterozygosityStandardDeviation", doc = "Standard deviation of eterozygosity for SNP and indel calling.", required = false) public double heterozygosityStandardDeviation = 0.01; @@ -134,10 +118,11 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; /** - * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES), - * then only this many alleles will be used. 
Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it - * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend - * that you not play around with this parameter. + * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or + * GENOTYPE_GIVEN_ALLELES), then only this many alleles will be used. Note that genotyping sites with many + * alternate alleles is both CPU and memory intensive and it scales exponentially based on the number of alternate + * alleles. Unless there is a good reason to change the default value, we highly recommend that you not play around + * with this parameter. * * See also {@link #MAX_GENOTYPE_COUNT}. */ @@ -146,19 +131,23 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public int MAX_ALTERNATE_ALLELES = 6; /** - * If there are more than this number of genotypes at a locus presented to the genotyper, then only this many genotypes will be used. - * The possible genotypes are simply different ways of partitioning alleles given a specific ploidy asumption. - * Therefore, we remove genotypes from consideration by removing alternate alleles that are the least well supported. - * The estimate of allele support is based on the ranking of the candidate haplotypes coming out of the graph building step. - * Note that the reference allele is always kept. + * If there are more than this number of genotypes at a locus presented to the genotyper, then only this many + * genotypes will be used. This is intended to deal with sites where the combination of high ploidy and high alt + * allele count can lead to an explosion in the number of possible genotypes, with extreme adverse effects on + * runtime performance. * - * Note that genotyping sites with large genotype counts is both CPU and memory intensive. 
- * Unless there is a good reason to change the default value, we highly recommend that you not play around with this parameter. + * How does it work? The possible genotypes are simply different ways of partitioning alleles given a specific + * ploidy assumption. Therefore, we remove genotypes from consideration by removing alternate alleles that are the + * least well supported. The estimate of allele support is based on the ranking of the candidate haplotypes coming + * out of the graph building step. Note however that the reference allele is always kept. * * The maximum number of alternative alleles used in the genotyping step will be the lesser of the two: * 1. the largest number of alt alleles, given ploidy, that yields a genotype count no higher than {@link #MAX_GENOTYPE_COUNT} * 2. the value of {@link #MAX_ALTERNATE_ALLELES} * + * As noted above, genotyping sites with large genotype counts is both CPU and memory intensive. Unless you have + * a good reason to change the default value, we highly recommend that you not play around with this parameter. + * * See also {@link #MAX_ALTERNATE_ALLELES}. */ @Advanced @@ -175,23 +164,19 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public int MAX_NUM_PL_VALUES = AFCalculator.MAX_NUM_PL_VALUES_DEFAULT; /** - * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model, - * see e.g. Waterson (1975) or Tajima (1996). - * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N + * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a + * particular locus, using an infinite sites model (see e.g. Watterson, 1975 or Tajima, 1996). This model asserts that
This model asserts that + * the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for k=1:N. + * However, there are instances where using this prior might not be desirable, e.g. for population studies where prior + * might not be appropriate, as for example when the ancestral status of the reference allele is not known. * - * There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate, - * as for example when the ancestral status of the reference allele is not known. - * By using this argument, the user can manually specify a list of probabilities for each AC>1 to be used as priors for genotyping, - * with the following restrictions: - * a) User must specify 2N values, where N is the number of samples. - * b) Only diploid calls supported. - * c) Probability values are specified in Double format, in linear space (not log10 space or Phred-scale). - * d) No negative values allowed. - * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one. - * f) If user-defined values add to more than one, an error will be produced. + * This argument allows you to manually specify a list of probabilities for each AC>1 to be used as + * priors for genotyping, with the following restrictions: only diploid calls are supported; you must specify 2 * + * N values where N is the number of samples; probability values must be positive and specified in Double format, + * in linear space (not log10 space nor Phred-scale); and all values must sum to 1. * - * If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g. - * -inputPrior 0.33 -inputPrior 0.33 + * For completely flat priors, specify the same value (=1/(2*N+1)) 2*N times, e.g. + * -inputPrior 0.33 -inputPrior 0.33 * for the single-sample diploid case.
*/ @Advanced @@ -199,9 +184,10 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public List inputPrior = Collections.emptyList(); /** - * Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy + * Sample ploidy - equivalent to number of chromosome copies per pool. For pooled experiments this should be set to + * the number of samples in pool multiplied by individual sample ploidy. */ - @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) + @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) public int samplePloidy = HomoSapiensConstants.DEFAULT_PLOIDY; /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java index 25f4b3274..046ae97fd 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java @@ -71,7 +71,7 @@ import java.util.List; *

    The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

    * *

    Statistical notes

    - *

    The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

    + *

    The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

    * *

    Caveats

    *
      diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InsertSizeRankSum.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InsertSizeRankSum.java index 4d7db9ba9..cb9afcfe5 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InsertSizeRankSum.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InsertSizeRankSum.java @@ -63,13 +63,19 @@ import java.util.List; /** * Allele specific Rank Sum Test for insert sizes of REF versus ALT reads * - *

      - * This annotation tests whether the insert sizes of reads supporting the REF allele and ALT allele are roughly equal. - * In case of multiple alternate alleles, each alternate allele is considered separately. + *

      This variant-level annotation compares the insert sizes of reads supporting the reference allele with those supporting each alternate allele. To be clear, it does so separately for each alternate allele.

      + * + *

      The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele are associated with smaller insert sizes than those supporting the reference allele. Conversely, a positive value indicates that reads supporting the alternate allele are associated with larger insert sizes than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

      * - *

      *

      Statistical notes

      - *

      See the method document for a more detailed explanation of the rank sum test.

      + *

      The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for insert sizes (insert sizes of reads supporting REF vs. insert sizes of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

      + * + *

      Caveats

      + *
        + *
      • Uninformative reads are not used in these calculations.
      • + *
      • The insert size rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
      • + *
      • This is an experimental annotation and as such it is unsupported. Use at your own risk.
      • + *
      * * */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MQMateRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MQMateRankSumTest.java index 67a9cc843..acbb6f6f7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MQMateRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MQMateRankSumTest.java @@ -75,7 +75,7 @@ import java.util.Map; *

      This annotation can be used to evaluate confidence in a variant call and could be used as a covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.

      * *

      Statistical notes

      - *

      The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities of the read's mate See the method document on statistical tests for a more detailed explanation of the ranksum test.

      + *

      The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities of the read's mate. See the method document on statistical tests for a more detailed explanation of the ranksum test.

      * * *

      Caveats

      *
      • The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
      • @@ -85,6 +85,8 @@ import java.util.Map; *

        Related annotations

        *
          *
        • AS_MappingQualityRankSumTest outputs the same rank sum test on the mapping quality of the reads themselves rather than their mates.
        • + *
        • MappingQualityRankSumTest outputs a version of the above mapping quality ranksum test annotation that includes all alternate alleles in a single calculation.
        • + *
        • RMSMappingQuality gives an estimation of the overall read mapping quality supporting a variant call.
        • *
        */ public class AS_MQMateRankSumTest extends AS_RankSumTest implements BetaTestingAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java index 9b6d33003..66aa5f1ee 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java @@ -76,7 +76,7 @@ import java.util.Map; *

        Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants. * *

        Statistical notes

        - *

        The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

        + *

        The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

        * *

        Caveats

        *
        • The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
        • @@ -86,6 +86,7 @@ import java.util.Map; *

          Related annotations

          *
            *
          • MappingQualityRankSumTest outputs a version of this annotation that includes all alternate alleles in a single calculation.
          • + *
          • AS_MQMateRankSumTest outputs the same allele-specific rank sum test on the mapping quality of the reads' mates rather than the reads themselves.
          • *
          • RMSMappingQuality gives an estimation of the overall read mapping quality supporting a variant call.
          • *
          * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java index 05a696a32..70a13c52d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java @@ -72,7 +72,9 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** - * Allele-specific implementation of rank sum test annotations + * Allele-specific implementation of rank sum test annotations. + * The RankSumTest concept is documented at https://software.broadinstitute.org/gatk/documentation/article?id=8031 + * */ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnnotation { private final static Logger logger = Logger.getLogger(AS_RMSAnnotation.class); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java index 0028fe9f8..7d377f9df 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java @@ -74,7 +74,7 @@ import java.util.List; *

          This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in relative position either way suggests that the sequencing process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.

          * *

          Statistical notes

          - *

          The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

          + *

          The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

          * *

          Caveat

          *
            diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java index dfea75fa6..66fdf7d9f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java @@ -70,7 +70,7 @@ import java.util.*; *

            The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

            * *

            Statistical notes

            - *

            The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

            + *

            The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

            * *

            Caveats

            *
              diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java index c45ec4766..daa240b2f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java @@ -66,7 +66,7 @@ import java.util.*; *

              This variant-level annotation tests whether the data supporting the reference allele shows more or less base clipping (hard clips) than those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have more hard-clipped bases than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have fewer hard-clipped bases than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.

              * *

              Statistical notes

              - *

              The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

              + *

              The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

              * *

              Caveat

              *

              The clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

              diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java index d737f3d8b..5cfddd1da 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java @@ -75,7 +75,7 @@ import java.util.*; *

              The output is a Phred-scaled p-value. The higher the output value, the more likely there is to be bias. More bias is indicative of false positive calls.

              * *

              Statistical notes

              - *

              See the method document on statistical tests for a more detailed explanation of this application of Fisher's Exact Test.

              + *

              See the method document on statistical tests for a more detailed explanation of this application of Fisher's Exact Test.

              * *

              Caveats

              *
                diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java index f8be1c044..862846415 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java @@ -76,7 +76,7 @@ import java.util.*; *

                This annotation estimates whether there is evidence of inbreeding in a population. The higher the score, the higher the chance that there is inbreeding.

                * *

                Statistical notes

                - *

                The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is the F statistic from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.

                + *

                The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is the F statistic from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.

                * *

                Caveats

                *
                  diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java index e4ac83eb8..dd6ea1407 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java @@ -66,7 +66,7 @@ import java.util.List; *

                  This variant-level annotation compares the likelihoods of reads to their best haplotype match, between reads that support the reference allele and those that support the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower likelihoods to their best haplotype match than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher likelihoods to their best haplotype match than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.

                  * *

                  Statistical notes

                  - *

                  The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for per-read likelihoods to the best haplotype match (likelihoods of reads supporting REF vs. likelihoods of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

                  + *

                  The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for per-read likelihoods to the best haplotype match (likelihoods of reads supporting REF vs. likelihoods of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

                  * *

                  Caveat

                  *

                  The likelihood rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

                  diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java index 7a8554eea..6fd308e5c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java @@ -68,7 +68,7 @@ import java.util.*; *

                  This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants. * *

                  Statistical notes

                  - *

                  The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

                  + *

                  The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

                  * *

                  Caveats

                  *
                  • The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
                  • @@ -78,6 +78,7 @@ import java.util.*; *

                    Related annotations

                    *
                      *
                    • AS_MappingQualityRankSumTest outputs an allele-specific version of this annotation.
                    • + *
                    • AS_MQMateRankSumTest outputs the allele-specific rank sum test on the mapping quality of the reads' mates rather than the reads themselves.
                    • *
                    • RMSMappingQuality gives an estimation of the overall read mapping quality supporting a variant call.
                    • *
                    * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java index c9eaba76a..c4e1cd376 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java @@ -71,7 +71,8 @@ import java.util.*; /** - * Abstract root for all RankSum-based annotations + * Abstract root for all RankSum-based annotations. + * The RankSumTest concept is documented at https://software.broadinstitute.org/gatk/documentation/article?id=8031 */ //TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java index c4cb8c3b6..d136a6d70 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java @@ -74,7 +74,7 @@ import java.util.*; *

                    This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in relative position either way suggests that the sequencing process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.

                    * *

                    Statistical notes

                    - *

                    The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

                    + *

                    The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

                    * *

                    Caveat

                    *
                      diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java index c818f4a79..3a7336549 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java @@ -77,24 +77,22 @@ import java.util.*; * *

                      and its inverse:

                      * - * + *
                      * * * *
                       + strand - strand
                      REF;X[0][0]X[0][1]
                      ALT;X[1][0]X[1][1]
                      - * + *
                      *

                      The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where

                      * - * $$ refRatio = \frac{max(X[0][0], X[0][1])}{min(X[0][0], X[0][1} $$ + * $$ refRatio = \frac{min(X[0][0], X[0][1])}{max(X[0][0], X[0][1])} $$ * *

                      and

                      * - * $$ altRatio = \frac{max(X[1][0], X[1][1])}{min(X[1][0], X[1][1]} $$ + * $$ altRatio = \frac{min(X[1][0], X[1][1])}{max(X[1][0], X[1][1])} $$ * *

                      ensures that the annotation value is large only.

                      * - *

                      See the method document on statistical tests for a more detailed explanation of this statistical test.

                      - * *

                      Caveat

                      *

                      * The name SOR is not entirely appropriate because the implementation was changed somewhere between the start of development and release of this annotation. Now SOR isn't really an odds ratio anymore. The goal was to separate certain cases of data without penalizing variants that occur at the ends of exons because they tend to only be covered by reads in one direction (depending on which end of the exon they're on), so if a variant has 10 ref reads in the + direction, 1 ref read in the - direction, 9 alt reads in the + direction and 2 alt reads in the - direction, it's actually not strand biased, but the FS score is pretty bad. The implementation that resulted derived in part from empirically testing some read count tables of various sizes with various ratios and deciding from there.

                      diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/contamination/ContEst.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/contamination/ContEst.java index a0639181f..4e33b4fa3 100755 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/contamination/ContEst.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/cancer/contamination/ContEst.java @@ -197,29 +197,29 @@ public class ContEst extends RodWalker