diff --git a/build.xml b/build.xml index 47e4eeb47..3aead62d6 100644 --- a/build.xml +++ b/build.xml @@ -111,10 +111,9 @@ - + - @@ -273,19 +272,19 @@ - - - - - - + + - - - - - - + + @@ -1031,6 +1030,14 @@ + + + + + + + + @@ -1038,7 +1045,7 @@ - + @@ -1135,12 +1142,35 @@ - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1148,6 +1178,7 @@ + @@ -1247,7 +1278,7 @@ - + diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index cdd31a5ef..5cdc15e5e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -55,36 +55,36 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSNP_ACS_Pools() { - PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","df0e67c975ef74d593f1c704daab1705"); + PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","651469eeacdb3ab9e2690cfb71f6a634"); } @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","7e5b28c9e21cc7e45c58c41177d8a0fc"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","be7dc20bdb5f200d189706bcf1aeb7ee"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES 
-alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","ae6c276cc46785a794acff6f7d10ecf7"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","25e5ea86d87b7d7ddaad834a6ed7481d"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","6987b89e04dcb604d3743bb09aa9587d"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","cdbf268d282e57189a88fb83f0e1fd72"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","d0780f70365ed1b431099fd3b4cec449"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","2ed40925cd112c1a45470d215b7ec4b3"); } @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","33695a998bcc906cabcc758727004387"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "b2725242114bf9cc9bca14679705ba40"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a8ba92634..e40a7ed38 100755 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("847605f4efafef89529fe0e496315edd")); + Arrays.asList("2ba9af34d2a4d55caf152265a30ead46")); executeTest("test MultiSample Pilot1", spec); } @@ -38,7 +38,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("5b31b811072a4df04524e13604015f9b")); + Arrays.asList("0630c35c070d7a7e0cf22b3cce797f22")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -46,7 +46,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("d9992e55381afb43742cc9b30fcd7538")); + Arrays.asList("5857dcb4e6a8422ae0813e42d433b122")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void 
testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("fea530fdc8677e10be4cc11625fa5376")); + Arrays.asList("489deda5d3276545364a06b7385f8bd9")); executeTest("test SingleSample Pilot2", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("b41b95aaa2c453c9b75b3b29a9c2718e")); + Arrays.asList("595ba44c75d08dab98df222b8e61ab70")); executeTest("test Multiple SNP alleles", spec); } @@ -70,7 +70,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testBadRead() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, - Arrays.asList("d915535c1458733f09f82670092fcab6")); + Arrays.asList("360f9795facdaa14c0cb4b05207142e4")); executeTest("test bad read", spec); } @@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("e14c9b1f9f34d6c16de445bfa385be89")); + Arrays.asList("4b4a62429f8eac1e2f27ba5e2edea9e5")); executeTest("test reverse trim", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest 
extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("935ee705ffe8cc6bf1d9efcceea271c8")); + Arrays.asList("cc892c91a93dbd8dbdf645803f35a0ee")); executeTest("test mismatched PLs", spec); } @@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "af8187e2baf516dde1cddea787a52b8a"; + private final static String COMPRESSED_OUTPUT_MD5 = "3fc7d2681ff753e2d68605d7cf8b63e3"; @Test public void testCompressedOutput() { @@ -149,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff")); + Arrays.asList("04dc83d7dfb42b8cada91647bd9f32f1")); executeTest("test min_base_quality_score 26", spec); } @@ -157,7 +157,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("55760482335497086458b09e415ecf54")); + Arrays.asList("4429a665a1048f958db3c204297cdb9f")); executeTest("test SLOD", spec); } @@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void 
testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("938e888a40182878be4c3cc4859adb69")); + Arrays.asList("f063e3573c513eaa9ce7d7df22143362")); executeTest("test NDA", spec); } @@ -173,7 +173,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("7dc186d420487e4e156a24ec8dea0951")); + Arrays.asList("d76e93e2676354dde832f08a508c6f88")); executeTest("test using comp track", spec); } @@ -187,17 +187,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9"); + testOutputParameters("-sites_only", "1a65172b9bd7a2023d48bc758747b34a"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "3f1fa34d8440f6f21654ce60c0ba8f28"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "8b26088a035e579c4afd3b46737291e4"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "f240434b4d3c234f6f9e349e9ec05f4e"); } private void testOutputParameters(final String args, final String md5) { @@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation 
+ "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb")); + Arrays.asList("aec378bed312b3557c6dd7ec740c8091")); executeTest("test confidence 1", spec1); } @@ -222,12 +222,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "8dd37249e0a80afa86594c3f1e720760" ); + testHeterozosity( 0.01, "5da6b24033a6b02f466836443d49560e" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "040d169e20fda56f8de009a6015eb384" ); + testHeterozosity( 1.0 / 1850, "1f284c4af967a3c26687164f9441fb16" ); } private void testHeterozosity(final double arg, final String md5) { @@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("0e4713e4aa44f4f8fcfea7138295a627")); + Arrays.asList("cff553c53de970f64051ed5711407038")); executeTest(String.format("test multiple technologies"), spec); } @@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("46ea5d1ceb8eed1d0db63c3577915d6c")); + Arrays.asList("f960a91963e614a6c8d8cda57836df24")); executeTest(String.format("test calling with BAQ"), spec); } @@ -289,7 +289,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("f6f8fbf733f20fbc1dd9ebaf8faefe6c")); + Arrays.asList("46a6d24c82ebb99d305462960fa09b7c")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -304,7 +304,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("4438ad0f03bbdd182d9bb59b15af0fa5")); 
+ Arrays.asList("2be25321bbc6a963dba7ecba5dd76802")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("27b4ace2ad5a83d8cccb040f97f29183")); + Arrays.asList("d6b2657cd5a4a949968cdab50efce515")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("b8129bf754490cc3c76191d8cc4ec93f")); + Arrays.asList("9cff66a321284c362f393bc4db21f756")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -337,7 +337,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("591332fa0b5b22778cf820ee257049d2")); + Arrays.asList("90c8cfcf65152534c16ed81104fc3bcd")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("d3d518448b01bf0f751824b3d946cd04")); + Arrays.asList("457b8f899cf1665de61e75084dbb79d0")); List 
result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("2ea18a3e8480718a80a415d3fea79f54")); + Arrays.asList("a13fe7aa3b9e8e091b3cf3442a056ec1")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -361,7 +361,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); + Arrays.asList("d075ad318739c8c56bdce857da1e48b9")); executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); } @@ -373,7 +373,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56")); + Arrays.asList("91c632ab17a1dd89ed19ebb20324f905")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -407,7 +407,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("90adefd39ed67865b0cb275ad0f07383")); + Arrays.asList("1d80e135d611fe19e1fb1882aa588a73")); executeTest("test minIndelFraction 0.0", spec); } @@ -415,7 +415,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { 
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("2fded43949e258f8e9f68893c61c1bdd")); + Arrays.asList("752139616752902fca13c312d8fe5e22")); executeTest("test minIndelFraction 0.25", spec); } @@ -423,7 +423,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction100() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 1", 1, - Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); + Arrays.asList("d66b9decf26e1704abda1a919ac149cd")); executeTest("test minIndelFraction 1.0", spec); } @@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6")); + Arrays.asList("b62ba9777efc05af4c36e2d4ce3ee67c")); executeTest("test calling on reads with Ns in CIGAR", spec); } @@ -451,18 +451,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("092e42a712afb660ec79ff11c55933e2")); + Arrays.asList("f72ecd00b2913f63788faa7dabb1d102")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); + testReducedCalling("SNP", "f059743858004ceee325f2a7761a2362"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "9d5418ddf1b227ae4d463995507f2b1c"); + 
testReducedCalling("INDEL", "04845ba1ec7d8d8b0eab2ca6bdb9c1a6"); } @@ -483,7 +483,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testContaminationDownsampling() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_to_filter 0.20", 1, - Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb")); + Arrays.asList("b500ad5959bce69f888a2fac024647e5")); executeTest("test contamination_percentage_to_filter 0.20", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8422d856e..bb9efe15d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,19 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "839de31b41d4186e2b12a5601525e894"); + HCTest(CEUTRIO_BAM, "", "7122d4f0ef94c5274aa3047cfebe08ed"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "2b68faa0e0493d92491d74b8f731963a"); + HCTest(NA12878_BAM, "", "6cd6e6787521c07a7bae98766fd628ab"); } // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "a2d56179cd19a41f8bfb995e225320bb"); + "44df2a9da4fbd2162ae44c3f2a6ef01f"); } 
private void HCTestComplexVariants(String bam, String args, String md5) { @@ -44,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd8d2ae8db9d98e932b0a7f345631eec"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "4a413eeb7a75cab0ab5370b4c08dcf8e"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -55,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "0761ff5cbf279be467833fa6708bf360"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "77cf5b5273828dd1605bb23a5aeafcaa"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -66,20 +66,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6380e25c1ec79c6ae2f891ced15bf4e1"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "87ca97f90e74caee35c35616c065821c"); } @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3a096d6139d15dcab82f5b091d08489d")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3df42d0550b51eb9b55aac61e8b3c452")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s 
-minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a518c7436544f2b5f71c9d9427ce1cce")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4dbc72b72e3e2d9d812d5a398490e213")); executeTest("HCTestStructuralIndels: ", spec); } @@ -93,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); + Arrays.asList("f8c2745bf71f2659a57494fcaa2c103b")); executeTest("HC calling on a ReducedRead BAM", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 8d0cefaa4..f8aec1489 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -284,7 +284,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { protected boolean abortExecution() { final boolean abort = engine.exceedsRuntimeLimit(progressMeter.getRuntimeInNanoseconds(), TimeUnit.NANOSECONDS); if ( abort ) { - final AutoFormattingTime aft = new AutoFormattingTime(TimeUnit.SECONDS.convert(engine.getRuntimeLimitInNanoseconds(), TimeUnit.NANOSECONDS), 1, 4); + final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); } return abort; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index af27d9c6f..24bac9deb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -13,6 +13,7 @@ import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; import java.util.HashMap; @@ -68,16 +69,49 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati return null; double QD = -10.0 * vc.getLog10PError() / (double)depth; - Map map = new HashMap(); + + if ( ! vc.isSNP() && ! vc.isSymbolic() ) { + // adjust for the event length + int averageLengthNum = 0; + int averageLengthDenom = 0; + int refLength = vc.getReference().length(); + for ( Allele a : vc.getAlternateAlleles() ) { + int numAllele = vc.getCalledChrCount(a); + int alleleSize; + if ( a.length() == refLength ) { + // SNP or MNP + byte[] a_bases = a.getBases(); + byte[] ref_bases = vc.getReference().getBases(); + int n_mismatch = 0; + for ( int idx = 0; idx < a_bases.length; idx++ ) { + if ( a_bases[idx] != ref_bases[idx] ) + n_mismatch++; + } + alleleSize = n_mismatch; + } + else if ( a.isSymbolic() ) { + alleleSize = 1; + } else { + alleleSize = Math.abs(refLength-a.length()); + } + averageLengthNum += alleleSize*numAllele; + averageLengthDenom += numAllele; + } + double averageLength = ( (double) averageLengthNum )/averageLengthDenom; + QD /= averageLength; + map.put(getKeyNames().get(1),String.format("%.2f",averageLength)); + } + map.put(getKeyNames().get(0), String.format("%.2f", QD)); return map; } - public List getKeyNames() { return Arrays.asList("QD"); } + public List getKeyNames() { return 
Arrays.asList("QD","AAL"); } public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); + return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth"), + new VCFInfoHeaderLine(getKeyNames().get(1), 1, VCFHeaderLineType.Float, "Average Allele Length")); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java index c9270a6a7..3681451c6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -50,7 +50,7 @@ public class ExactCallLogger implements Cloneable { return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s", vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(), originalCall.getLog10PosteriorOfAFGT0(), - new AutoFormattingTime(runtime / 1e9).toString()); + new AutoFormattingTime(runtime).toString()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java deleted file mode 100644 index 3854a4a8c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the 
rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMRecordCoordinateComparator; - -/** - * Extends Picard's Comparator for sorting SAMRecords by coordinate. This one actually deals with unmapped reads - * (among other things) sitting at the same position as their mates (so that they both can be put into the same set). - */ -public class SAMRecordCoordinateComparatorWithUnmappedReads extends SAMRecordCoordinateComparator { - public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) { - int cmp = fileOrderCompare(samRecord1, samRecord2); - if ( cmp != 0 ) - return cmp; - - // deal with unmapped reads - if ( samRecord1.getReadUnmappedFlag() != samRecord2.getReadUnmappedFlag() ) - return (samRecord1.getReadUnmappedFlag()? 1: -1); - - if ( samRecord1.getReadNegativeStrandFlag() != samRecord2.getReadNegativeStrandFlag() ) - return (samRecord1.getReadNegativeStrandFlag()? 
1: -1); - - // even the names can be the same - cmp = samRecord1.getReadName().compareTo(samRecord2.getReadName()); - if ( cmp != 0 ) - return cmp; - - if ( samRecord1.getDuplicateReadFlag() != samRecord2.getDuplicateReadFlag() ) - return (samRecord1.getDuplicateReadFlag()? -1: 1); - - if ( samRecord1.getReadPairedFlag() && samRecord2.getReadPairedFlag() && samRecord1.getFirstOfPairFlag() != samRecord2.getFirstOfPairFlag() ) - return (samRecord1.getFirstOfPairFlag()? -1: 1); - - // such a case was actually observed - return samRecord1.getMappingQuality() - samRecord2.getMappingQuality(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java deleted file mode 100644 index 68be1629c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java +++ /dev/null @@ -1,2291 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.*; -import org.apache.commons.jexl2.Expression; -import org.apache.commons.jexl2.JexlContext; -import org.apache.commons.jexl2.JexlEngine; -import org.apache.commons.jexl2.MapContext; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; -import org.broadinstitute.sting.gatk.filters.Platform454Filter; -import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; -import org.broadinstitute.sting.utils.codecs.refseq.Transcript; 
-import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.collections.CircularArray; -import org.broadinstitute.sting.utils.collections.PrimitivePair; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; - -import java.io.*; -import java.util.*; - - -/** - * Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well, - * but this latter functionality is now superceded by UnifiedGenotyper. - * - *

- * This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing - * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs - * include additional statistics such as mismatches and base qualitites around the calls, read strandness (how many - * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional - * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will - * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional - * metrics for the post-processing tools to make the final decision). The calls are performed by default - * from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I - * command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments, - * respectively. Indels are called from the tumor sample and annotated as germline - * if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic - * if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling - * is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold - * in tumor sample, it will not be reported. - * - * To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input - * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged - * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups). 
- * - * Which (putative) calls will make it into the output file(s) is controlled by an expression/list of expressions passed with -filter - * flag: if any of the expressions evaluate to TRUE, the site will be discarded. Otherwise the putative call and all the - * associated statistics will be printed into the output. Expressions recognize the following variables(in paired-sample - * somatic mode variables are prefixed with T_ and N_ for Tumor and Normal, e.g. N_COV and T_COV are defined instead of COV): - * COV for coverage at the site, INDEL_F for fraction of reads supporting consensus indel at the site (wrt total coverage), - * INDEL_CF for fraction of reads with consensus indel wrt all reads with an indel at the site, CONS_CNT for the count of - * reads supporting the consensus indel at the site. Conventional arithmetic and logical operations are supported. For instance, - * N_COV<4||T_COV<6||T_INDEL_F<0.3||T_INDEL_CF<0.7 instructs the tool to only output indel calls with at least 30% observed - * allelic fraction and with consensus indel making at least 70% of all indel observations at the site, and only at the sites - * where tumor coverage and normal coverage are at least 6 and 4, respectively. - *

Input

- *

- * Tumor and normal bam files (or single sample bam file(s) in --unpaired mode). - *

- * - *

Output

- *

- * Indel calls with associated metrics. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T SomaticIndelDetector \
- *   -o indels.vcf \
- *   -verbose indels.txt
- *   -I:normal normal.bam \
- *   -I:tumor tumor.bam
- * 
- * - */ - -@DocumentedGATKFeature( groupName = "Cancer-specific Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) -@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class}) -public class SomaticIndelDetector extends ReadWalker { -// @Output -// PrintStream out; - @Output(doc="File to write variants (indels) in VCF format",required=true) - protected VariantContextWriter vcf_writer = null; - - @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true) - @Deprecated - java.io.File output_file; - - @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print callability metrics output", required = false) - public PrintStream metricsWriter = null; - -// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false) -// boolean FORMAT_VCF = false; - - @Hidden - @Input(fullName = "genotype_intervals", shortName = "genotype", - doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false) - public IntervalBinding genotypeIntervalsFile = null; - - @Hidden - @Argument(fullName="unpaired", shortName="unpaired", - doc="Perform unpaired calls (no somatic status detection)", required=false) - boolean call_unpaired = false; - boolean call_somatic ; - - @Argument(fullName="verboseOutput", shortName="verbose", - doc="Verbose output file in text format", required=false) - java.io.File verboseOutput = null; - - @Argument(fullName="bedOutput", shortName="bed", - doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) - java.io.File bedOutput = null; - - @Deprecated - @Argument(fullName="minCoverage", shortName="minCoverage", - doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+ - "with --unpaired (single sample) option, this value is used for 
minimum sample coverage. "+ - "INSTEAD USE: T_COV FILTER_EXPRESSIONS = new ArrayList(); - -//@Argument(fullName="blacklistedLanes", shortName="BL", -// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ -// "by this application, so they will not contribute indels to consider and will not be counted.", required=false) -//PlatformUnitFilterHelper dummy; - - @Hidden - @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on", - required=false) Boolean DEBUG = false; - @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+ - "May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+ - "window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+ - "than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+ - "if window is too small",required=false) int WINDOW_SIZE = 200; - @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+ - " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000; - - - - private WindowContext tumor_context; - private WindowContext normal_context; - private int currentContigIndex = -1; - private int contigLength = -1; // we see to much messy data with reads hanging out of contig ends... 
- private int currentPosition = -1; // position of the last read we've seen on the current contig - private String refName = null; - private java.io.Writer output = null; - private GenomeLoc location = null; - private long normalCallsMade = 0L, tumorCallsMade = 0L; - - boolean outOfContigUserWarned = false; - - private LocationAwareSeekableRODIterator refseqIterator=null; - -// private Set normalReadGroups; // we are going to remember which read groups are normals and which are tumors in order to be able -// private Set tumorReadGroups ; // to properly assign the reads coming from a merged stream - private Set normalSamples; // we are going to remember which samples are normal and which are tumor: - private Set tumorSamples ; // these are used only to generate genotypes for vcf output - - private int NQS_WIDTH = 5; // 5 bases on each side of the indel for NQS-style statistics - - private Writer bedWriter = null; - private Writer verboseWriter = null; - - - private static String annGenomic = "GENOMIC\t"; - private static String annIntron = "INTRON"; - private static String annUTR = "UTR"; - private static String annCoding = "CODING"; - private static String annUnknown = "UNKNOWN"; - - enum CallType { - NOCOVERAGE, - BADCOVERAGE, - NOEVIDENCE, - GERMLINE, - SOMATIC - }; - - private SAMRecord lastRead; - private byte[] refBases; - private ReferenceDataSource refData; - private Iterator genotypeIntervalIterator = null; - - // the current interval in the list of intervals, for which we want to do full genotyping - private GenomeLoc currentGenotypeInterval = null; - private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed; - // can be 1 base before lastGenotyped start - - private JexlEngine jexlEngine = new JexlEngine(); - private ArrayList jexlExpressions = new ArrayList(); - - // the following arrays store indel source-specific (normal/tumor) metric names - // for fast access when populating JEXL 
expression contexts (see IndelPrecall.fillContext()) - private final static String[] normalMetricsCassette = new String[4]; - private final static String[] tumorMetricsCassette = new String[4]; - private final static String[] singleMetricsCassette = new String[4]; - private final static int C_COV=0; - private final static int C_CONS_CNT=1; - private final static int C_INDEL_F=2; - private final static int C_INDEL_CF=3; - static { - normalMetricsCassette[C_COV] = "N_COV"; - tumorMetricsCassette[C_COV] = "T_COV"; - singleMetricsCassette[C_COV] = "COV"; - normalMetricsCassette[C_CONS_CNT] = "N_CONS_CNT"; - tumorMetricsCassette[C_CONS_CNT] = "T_CONS_CNT"; - singleMetricsCassette[C_CONS_CNT] = "CONS_CNT"; - normalMetricsCassette[C_INDEL_F] = "N_INDEL_F"; - tumorMetricsCassette[C_INDEL_F] = "T_INDEL_F"; - singleMetricsCassette[C_INDEL_F] = "INDEL_F"; - normalMetricsCassette[C_INDEL_CF] = "N_INDEL_CF"; - tumorMetricsCassette[C_INDEL_CF] = "T_INDEL_CF"; - singleMetricsCassette[C_INDEL_CF] = "INDEL_CF"; - } - - // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt" - - private Set getVCFHeaderInfo() { - Set headerInfo = new HashSet(); - - // first, the basic info - headerInfo.add(new VCFHeaderLine("source", "SomaticIndelDetector")); - headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - headerInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY)); - - // FORMAT and INFO fields -// headerInfo.addAll(VCFUtils.getSupportedHeaderStrings()); - - headerInfo.addAll(VCFIndelAttributes.getAttributeHeaderLines()); - if ( call_somatic ) { - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); - } else { - } - - // all of the arguments from the argument collection - Set args = new HashSet(); - args.add(this); - args.addAll(getToolkit().getFilters()); - Map commandLineArgs = getToolkit().getApproximateCommandLineArguments(args); - for ( Map.Entry commandLineArg : 
commandLineArgs.entrySet() ) - headerInfo.add(new VCFHeaderLine(String.format("SID_%s", commandLineArg.getKey()), commandLineArg.getValue())); - // also, the list of input bams - for ( String fileName : getToolkit().getArguments().samFiles ) - headerInfo.add(new VCFHeaderLine("SID_bam_file_used", fileName)); - - return headerInfo; - } - - - @Override - public void initialize() { - - call_somatic = (call_unpaired ? false : true); - normal_context = new WindowContext(0,WINDOW_SIZE); - normalSamples = new HashSet(); - - if ( bedOutput != null && output_file != null ) { - throw new UserException.DeprecatedArgument("-O", "-O option is deprecated and -bed option replaces it; you can not use both at the same time"); - } - - if ( RefseqFileName != null ) { - logger.info("Using RefSeq annotations from "+RefseqFileName); - - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - getToolkit().getArguments().unsafe); - RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName)); - - refseqIterator = new SeekableRODIterator(refseq.getHeader(), - refseq.getSequenceDictionary(), - getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - refseq.getIterator()); - } - - if ( refseqIterator == null ) logger.info("No gene annotations available"); - - int nSams = getToolkit().getArguments().samFiles.size(); - - if ( call_somatic ) { - if ( nSams < 2 ) throw new UserException.BadInput("In default (paired sample) mode at least two bam files (normal and tumor) must be specified"); - tumor_context = new WindowContext(0,WINDOW_SIZE); - tumorSamples = new HashSet(); - } - - int nNorm = 0; - int nTum = 0; - for ( SAMReaderID rid : getToolkit().getReadsDataSource().getReaderIDs() ) { - Tags tags = rid.getTags() ; - if ( tags.getPositionalTags().isEmpty() && call_somatic ) - throw new 
UserException.BadInput("In default (paired sample) mode all input bam files must be tagged as either 'normal' or 'tumor'. Untagged file: "+ - getToolkit().getSourceFileForReaderID(rid)); - boolean normal = false; - boolean tumor = false; - for ( String s : tags.getPositionalTags() ) { // we allow additional unrelated tags (and we do not use them), but we REQUIRE one of Tumor/Normal to be present if --somatic is on - if ( "NORMAL".equals(s.toUpperCase()) ) { - normal = true; - nNorm++; - } - if ( "TUMOR".equals(s.toUpperCase()) ) { - tumor = true; - nTum++ ; - } - } - if ( call_somatic && normal && tumor ) throw new UserException.BadInput("Input bam file "+ - getToolkit().getSourceFileForReaderID(rid)+" is tagged both as normal and as tumor. Which one is it??"); - if ( call_somatic && !normal && ! tumor ) - throw new UserException.BadInput("In somatic mode all input bams must be tagged as either normal or tumor. Encountered untagged file: "+ - getToolkit().getSourceFileForReaderID(rid)); - if ( ! call_somatic && (normal || tumor) ) - System.out.println("WARNING: input bam file "+getToolkit().getSourceFileForReaderID(rid) - +" is tagged as Normal and/or Tumor, but somatic mode is not on. 
Tags will ne IGNORED"); - if ( call_somatic && tumor ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { - tumorSamples.add(rg.getSample()); - } - } else { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { - normalSamples.add(rg.getSample()); - } - } - if ( genotypeIntervalsFile != null ) { - - // read in the whole list of intervals for cleaning - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), genotypeIntervalsFile.getIntervals(getToolkit()), IntervalMergingRule.OVERLAPPING_ONLY); - genotypeIntervalIterator = locs.iterator(); - - // wrap intervals requested for genotyping inside overlapping iterator, so that we actually - // genotype only on the intersections of the requested intervals with the -L intervals - genotypeIntervalIterator = new OverlappingIntervalIterator(genotypeIntervalIterator, getToolkit().getIntervals().iterator() ); - - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? 
genotypeIntervalIterator.next() : null; - - if ( DEBUG) System.out.println("DEBUG>> first genotyping interval="+currentGenotypeInterval); - - if ( currentGenotypeInterval != null ) lastGenotypedPosition = currentGenotypeInterval.getStart()-1; - } - - } - - location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); - - try { - // we already checked that bedOutput and output_file are not set simultaneously - if ( bedOutput != null ) bedWriter = new FileWriter(bedOutput); - if ( output_file != null ) bedWriter = new FileWriter(output_file); - } catch (java.io.IOException e) { - throw new UserException.CouldNotReadInputFile(bedOutput, "Failed to open BED file for writing.", e); - } - try { - if ( verboseOutput != null ) verboseWriter = new FileWriter(verboseOutput); - } catch (java.io.IOException e) { - throw new UserException.CouldNotReadInputFile(verboseOutput, "Failed to open BED file for writing.", e); - } - - vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ; - refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile); - - // Now initialize JEXL expressions: - if ( FILTER_EXPRESSIONS.size() == 0 ) { - if ( call_unpaired ) { - FILTER_EXPRESSIONS.add("COV<6||INDEL_F<0.3||INDEL_CF<0.7"); - } else { - FILTER_EXPRESSIONS.add("T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7"); - } - } - for ( String s : FILTER_EXPRESSIONS ) { - try { - Expression e = jexlEngine.createExpression(s); - jexlExpressions.add(e); - } catch (Exception e) { - throw new UserException.BadArgumentValue("Filter expression", "Invalid expression used (" + s + "). 
Please see the JEXL docs for correct syntax.") ; - } - - } - } - - - @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - - // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); - - if ( DEBUG ) { - // System.out.println("DEBUG>> read at "+ read.getAlignmentStart()+"-"+read.getAlignmentEnd()+ - // "("+read.getCigarString()+")"); - if ( read.getDuplicateReadFlag() ) System.out.println("DEBUG>> Duplicated read (IGNORED)"); - } - - if ( AlignmentUtils.isReadUnmapped(read) || - read.getDuplicateReadFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getMappingQuality() == 0 ) { - return 0; // we do not need those reads! - } - - if ( read.getReferenceIndex() != currentContigIndex ) { - // we just jumped onto a new contig - if ( DEBUG ) System.out.println("DEBUG>>> Moved to contig "+read.getReferenceName()); - if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName()+": contig is out of order; input BAM file is unsorted"); - - // print remaining indels from the previous contig (if any); - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); - - currentContigIndex = read.getReferenceIndex(); - currentPosition = read.getAlignmentStart(); - refName = new String(read.getReferenceName()); - - location = getToolkit().getGenomeLocParser().createGenomeLoc(refName,location.getStart(),location.getStop()); - contigLength = getToolkit().getGenomeLocParser().getContigInfo(refName).getSequenceLength(); - outOfContigUserWarned = false; - - lastGenotypedPosition = -1; - - normal_context.clear(); // reset coverage window; this will also set reference position to 0 - if ( call_somatic) tumor_context.clear(); - - refBases = new String(refData.getReference().getSequence(read.getReferenceName()).getBases()).toUpperCase().getBytes(); - } 
- - // we have reset the window to the new contig if it was required and emitted everything we collected - // on a previous contig. At this point we are guaranteed that we are set up properly for working - // with the contig of the current read. - - // NOTE: all the sanity checks and error messages below use normal_context only. We make sure that normal_context and - // tumor_context are synchronized exactly (windows are always shifted together by emit_somatic), so it's safe - - if ( read.getAlignmentStart() < currentPosition ) // oops, read out of order? - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName() +" out of order on the contig\n"+ - "Read starts at "+refName+":"+read.getAlignmentStart()+"; last read seen started at "+refName+":"+currentPosition - +"\nLast read was: "+lastRead.getReadName()+" RG="+lastRead.getAttribute("RG")+" at "+lastRead.getAlignmentStart()+"-" - +lastRead.getAlignmentEnd()+" cigar="+lastRead.getCigarString()); - - currentPosition = read.getAlignmentStart(); - lastRead = read; - - if ( read.getAlignmentEnd() > contigLength ) { - if ( ! outOfContigUserWarned ) { - System.out.println("WARNING: Reads aligned past contig length on "+ location.getContig()+"; all such reads will be skipped"); - outOfContigUserWarned = true; - } - return 0; - } - - long alignmentEnd = read.getAlignmentEnd(); - Cigar c = read.getCigar(); - int lastNonClippedElement = 0; // reverse offset to the last unclipped element - CigarOperator op = null; - // moving backwards from the end of the cigar, skip trailing S or H cigar elements: - do { - lastNonClippedElement++; - op = c.getCigarElement( c.numCigarElements()-lastNonClippedElement ).getOperator(); - } while ( op == CigarOperator.H || op == CigarOperator.S ); - - // now op is the last non-S/H operator in the cigar. 
- - // a little trick here: we want to make sure that current read completely fits into the current - // window so that we can accumulate indel observations over the whole length of the read. - // The ::getAlignmentEnd() method returns the last position on the reference where bases from the - // read actually match (M cigar elements). After our cleaning procedure, we can have reads that end - // with I element, which is not gonna be counted into alignment length on the reference. On the other hand, - // in this program we assign insertions, internally, to the first base *after* the insertion position. - // Hence, we have to make sure that that extra base is already in the window or we will get IndexOutOfBounds. - - if ( op == CigarOperator.I) alignmentEnd++; - - if ( alignmentEnd > normal_context.getStop()) { - - // we don't emit anything until we reach a read that does not fit into the current window. - // At that point we try shifting the window to the start of that read (or reasonably close) and emit everything prior to - // that position. This is legitimate, since the reads are sorted and we are not gonna see any more coverage at positions - // below the current read's start. - // Clearly, we assume here that window is large enough to accomodate any single read, so simply shifting - // the window to around the read's start will ensure that the read fits... - - if ( DEBUG) System.out.println("DEBUG>> Window at "+normal_context.getStart()+"-"+normal_context.getStop()+", read at "+ - read.getAlignmentStart()+": trying to emit and shift" ); - if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false ); - else emit( read.getAlignmentStart(), false ); - - // let's double check now that the read fits after the shift - if ( read.getAlignmentEnd() > normal_context.getStop()) { - // ooops, looks like the read does not fit into the window even after the latter was shifted!! - // we used to die over such reads and require user to run with larger window size. 
Now we - // just print a warning and discard the read (this means that our counts can be slightly off in - // th epresence of such reads) - //throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+ - // "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ - // read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ - // "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); - System.out.println("WARNING: Read "+read.getReadName()+ - " is out of coverage window bounds. Probably window is too small and the window_size value must be increased.\n"+ - " The read is ignored in this run (so all the counts/statistics reported will not include it).\n"+ - " Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ - read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ - "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); - return 1; - } - } - - if ( call_somatic ) { - - Tags tags = getToolkit().getReaderIDForRead(read).getTags(); - boolean assigned = false; - for ( String s : tags.getPositionalTags() ) { - if ( "NORMAL".equals(s.toUpperCase()) ) { - normal_context.add(read,ref.getBases()); - assigned = true; - break; - } - if ( "TUMOR".equals(s.toUpperCase()) ) { - tumor_context.add(read,ref.getBases()); - assigned = true; - break; - } - } - if ( ! assigned ) - throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+ - "has no Normal/Tumor tag associated with it"); - -// String rg = (String)read.getExtendedAttribute("RG"); -// if ( rg == null ) -// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. 
RG is required for somatic calls."); - -// if ( normalReadGroups.contains(rg) ) { -// normal_context.add(read,ref.getBases()); -// } else if ( tumorReadGroups.contains(rg) ) { -// tumor_context.add(read,ref.getBases()); -// } else { -// throw new UserException.MalformedBam(read, "Unrecognized read group in merged stream: "+rg); -// } - - if ( tumor_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+tumor_context.getStart()+'-'+tumor_context.getStop()+" in tumor sample. The whole window will be dropped."); - tumor_context.shift(WINDOW_SIZE); - normal_context.shift(WINDOW_SIZE); - } - if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+" in normal sample. The whole window will be dropped"); - tumor_context.shift(WINDOW_SIZE); - normal_context.shift(WINDOW_SIZE); - } - - - } else { - normal_context.add(read, ref.getBases()); - if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+". The whole window will be dropped"); - normal_context.shift(WINDOW_SIZE); - } - } - - return 1; - } - - /** An auxiliary shortcut: returns true if position(location.getContig(), p) is past l */ - private boolean pastInterval(long p, GenomeLoc l) { - return ( location.getContigIndex() > l.getContigIndex() || - location.getContigIndex() == l.getContigIndex() && p > l.getStop() ); - } - - /** Emit calls of the specified type across genotyping intervals, from position lastGenotypedPosition+1 to - * pos-1, inclusive. 
- * @param contigIndex - * @param pos - * @param call - */ - /* - private void emitNoCallsUpTo(int contigIndex, long pos, CallType call) { - - if ( contigIndex < currentGenotypeInterval.getContigIndex() || - contigIndex == currentGenotypeInterval.getContigIndex() && pos <= currentGenotypeInterval.getStart() ) return; - - if ( contigIndex == currentGenotypeInterval.getContigIndex() && pos >= currentGenotypeInterval.getStart() ) { - for ( long p = lastGenotypedPosition+1; p < pos; p++ ) { - - } - } - while( currentGenotypeInterval != null ) { - - while ( ) - if ( genotypeIntervalIterator.hasNext() ) { - currentGenotypeInterval = genotypeIntervalIterator.next() ; - if ( pastInterval(p,currentGenotypeInterval) ) { - // if we are about to jump over the whole next interval, we need to emit NO_COVERAGE calls there! - emitNoCoverageCalls(currentGenotypeInterval); - } - } else { - currentGenotypeInterval = null; - } - } - } -*/ - - /** Output indel calls up to the specified position and shift the window: after this method is executed, the - * first element of the window maps onto 'position', if possible, or at worst a few bases to the left of 'position' if we may need more - * reads to get full NQS-style statistics for an indel in the close proximity of 'position'. 
- * - * @param position - */ - private void emit(long position, boolean force) { - - long adjustedPosition = adjustPosition(position); - - if ( adjustedPosition == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(position-normal_context.getStart())); - return; - } - long move_to = adjustedPosition; - - for ( int pos = normal_context.getStart() ; pos < Math.min(adjustedPosition,normal_context.getStop()+1) ; pos++ ) { - - boolean genotype = false; - // first let's see if we need to genotype current position: - - final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) - - if ( pos <= lastGenotypedPosition ) continue; - - while ( currentGenotypeInterval != null ) { - - // if we did not even reach next interval yet, no genotyping at current position: - if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || - location.getContigIndex() == currentGenotypeInterval.getContigIndex() && - p < currentGenotypeInterval.getStart() ) break; - if ( pastInterval(p, currentGenotypeInterval) ) { - // we are past current genotyping interval, so we are done with it; let's load next interval: - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - continue; // re-enter the loop to check against the interval we just loaded - } - - // we reach this point only if p is inside current genotyping interval; set the flag and bail out: - genotype = true; - break; - } - -// if ( DEBUG ) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); - - if ( normal_context.indelsAt(pos).size() == 0 && ! 
genotype ) continue; - - IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); - JexlContext jc = new MapContext(); - normalCall.fillContext(jc,singleMetricsCassette); - boolean discard_event = false; - - for ( Expression e : jexlExpressions ) { - if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; } - } - - if ( discard_event && ! genotype ) { - normal_context.indelsAt(pos).clear(); - continue; //* - } - -// if ( normalCall.getCoverage() < minCoverage && ! genotype ) { -// if ( DEBUG ) { -// System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); -// } -// continue; // low coverage -// } - - if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos); - - long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() ); - long right = pos+( normalCall.getVariant() == null ? 0 : normalCall.getVariant().lengthOnRef())+NQS_WIDTH-1; - - if ( right >= adjustedPosition && ! 
force) { - // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect - - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = adjustPosition(left); - if ( move_to == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(adjustedPosition-normal_context.getStart())); - return; - } - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); - break; - } - - // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right: - if ( right > normal_context.getStop() ) right = normal_context.getStop(); - - // location = getToolkit().getGenomeLocParser().setStart(location,pos); - // location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data - - location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos); - - if ( ! discard_event ) normalCallsMade++; - printVCFLine(vcf_writer,normalCall, discard_event); - if ( bedWriter != null ) normalCall.printBedLine(bedWriter); - if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, discard_event); - lastGenotypedPosition = pos; - - normal_context.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - } - - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); - normal_context.shift((int)(move_to - normal_context.getStart() ) ); - } - - /** A shortcut. 
Returns true if we got indels within the specified interval in single and only window context
- * (for single-sample calls) or in either of the two window contexts (for two-sample/somatic calls)
- *
- */
- private boolean indelsPresentInInterval(long start, long stop) {
- if ( tumor_context == null ) return normal_context.hasIndelsInInterval(start,stop);
- return tumor_context.hasIndelsInInterval(start,stop) ||
- normal_context.hasIndelsInInterval(start,stop);
- }
- /** Takes the position, to which window shift is requested, and tries to adjust it in such a way that no NQS window is broken.
- * Namely, this method checks, iteratively, if there is an indel within NQS_WIDTH bases ahead of initially requested or adjusted
- * shift position. If there is such an indel,
- * then shifting to that position would lose some or all NQS-window bases to the left of the indel (since it's not going to be emitted
- * just yet). Instead, this method tries to readjust the shift position leftwards so that full NQS window to the left of the next indel
- * is preserved. This method tries this strategy 4 times (so that it would never walk away too far to the left), and if it fails to find
- * an appropriate adjusted shift position (which could happen if there are many indels following each other at short intervals), it will give up,
- * go back to the original requested shift position and try finding the first shift position that has no indel associated with it.
- */
-
- private long adjustPosition(long request) {
- long initial_request = request;
- int attempts = 0;
- boolean failure = false;
- while ( indelsPresentInInterval(request,request+NQS_WIDTH) ) {
- request -= NQS_WIDTH;
- if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. 
Resetting shift to "+request);
- attempts++;
- if ( attempts == 4 ) {
- if ( DEBUG ) System.out.println("DEBUG>> attempts to preserve full NQS window failed; now trying to find any suitable position.") ;
- failure = true;
- break;
- }
- }
-
- if ( failure ) {
- // we tried 4 times but did not find a good shift position that would preserve full nqs window
- // around all indels. let's fall back and find any shift position as long as there's no indel at the very
- // first position after the shift (this is bad for other reasons); if it breaks a nqs window, so be it
- request = initial_request;
- attempts = 0;
- while ( indelsPresentInInterval(request,request+1) ) {
- request--;
- if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request);
- attempts++;
- if ( attempts == 50 ) {
- System.out.println("WARNING: Indel at every position in the interval "+refName+":"+request+"-"+initial_request+
- ". Can not find a break to shift context window to; no calls will be attempted in the current window.");
- return -1;
- }
- }
- }
- if ( DEBUG ) System.out.println("DEBUG>> Found acceptable target position "+request);
- return request;
- }
-
- /** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed
- * first elements of the coverage arrays map onto 'position', or a few bases prior to the specified position
- * if there is an indel in close proximity to 'position' so that we may get more coverage around it later. 
- * - * @param position - */ - private void emit_somatic(long position, boolean force) { - - long adjustedPosition = adjustPosition(position); - if ( adjustedPosition == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(position-normal_context.getStart())); - tumor_context.shift((int)(position-tumor_context.getStart())); - return; - } - long move_to = adjustedPosition; - - if ( DEBUG ) System.out.println("DEBUG>> Emitting in somatic mode up to "+position+" force shift="+force+" current window="+tumor_context.getStart()+"-"+tumor_context.getStop()); - - for ( int pos = tumor_context.getStart() ; pos < Math.min(adjustedPosition,tumor_context.getStop()+1) ; pos++ ) { - - boolean genotype = false; - // first let's see if we need to genotype current position: - - final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) - - if ( pos <= lastGenotypedPosition ) continue; - - while ( currentGenotypeInterval != null ) { - - // if we did not even reach next interval yet, no genotyping at current position: - if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || - location.getContigIndex() == currentGenotypeInterval.getContigIndex() && - p < currentGenotypeInterval.getStart() ) break; - if ( pastInterval(p, currentGenotypeInterval) ) { - // we are past current genotyping interval, so we are done with it; let's load next interval: - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? 
genotypeIntervalIterator.next() : null;
- continue; // re-enter the loop to check against the interval we just loaded
- }
-
- // we reach this point only if p is inside current genotyping interval; set the flag and bail out:
- genotype = true;
- break;
- }
-// if ( DEBUG) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype);
-
- if ( tumor_context.indelsAt(pos).size() == 0 && ! genotype ) continue; // no indels in tumor
-
- if ( DEBUG && genotype ) System.out.println("DEBUG>> Genotyping requested at "+pos);
-
- IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH);
- IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH);
-
- JexlContext jc = new MapContext();
- tumorCall.fillContext(jc,tumorMetricsCassette);
- normalCall.fillContext(jc,normalMetricsCassette);
- boolean discard_event = false;
-
- for ( Expression e : jexlExpressions ) {
- if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; }
- }
-
- if ( discard_event && ! genotype ) {
- tumor_context.indelsAt(pos).clear();
- normal_context.indelsAt(pos).clear();
- continue; //*
- }
-// if ( tumorCall.getCoverage() < minCoverage && ! genotype ) {
-// if ( DEBUG ) {
-// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)");
-// }
-// continue; // low coverage
-// }
-// if ( normalCall.getCoverage() < minNormalCoverage && ! 
genotype ) { -// if ( DEBUG ) { -// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); -// } -// continue; // low coverage -// } - - if ( DEBUG ) { - System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, "); - System.out.print("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in normal at "+pos); - } - - long left = Math.max( pos-NQS_WIDTH, tumor_context.getStart() ); - long right = pos+ ( tumorCall.getVariant() == null ? 0 : tumorCall.getVariant().lengthOnRef() )+NQS_WIDTH-1; - - if ( right >= adjustedPosition && ! force) { - // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect - - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = adjustPosition(left); - if ( move_to == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(adjustedPosition-normal_context.getStart())); - tumor_context.shift((int)(adjustedPosition-tumor_context.getStart())); - return; - } - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); - break; - } - - if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right - - location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data - -// boolean haveCall = tumorCall.isCall(); // cache the value - - if ( ! 
discard_event ) tumorCallsMade++; - - printVCFLine(vcf_writer,normalCall,tumorCall,discard_event); - - if ( bedWriter != null ) tumorCall.printBedLine(bedWriter); - - if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall, discard_event ); - lastGenotypedPosition = pos; - - tumor_context.indelsAt(pos).clear(); - normal_context.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - } - - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); - tumor_context.shift((int)(move_to - tumor_context.getStart() ) ); - normal_context.shift((int)(move_to - normal_context.getStart() ) ); - } - - private String makeFullRecord(IndelPrecall normalCall, IndelPrecall tumorCall) { - StringBuilder fullRecord = new StringBuilder(); - if ( tumorCall.getVariant() != null || normalCall.getVariant() == null) { - fullRecord.append(tumorCall.makeEventString()); - } else { - fullRecord.append(normalCall.makeEventString()); - } - fullRecord.append('\t'); - fullRecord.append(normalCall.makeStatsString("N_")); - fullRecord.append('\t'); - fullRecord.append(tumorCall.makeStatsString("T_")); - fullRecord.append('\t'); - return fullRecord.toString(); - } - - private String makeFullRecord(IndelPrecall normalCall) { - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(normalCall.makeEventString()); - fullRecord.append('\t'); - fullRecord.append(normalCall.makeStatsString("")); - fullRecord.append('\t'); - return fullRecord.toString(); - } - - private String getAnnotationString(RODRecordList ann) { - if ( ann == null ) return annGenomic; - else { - StringBuilder b = new StringBuilder(); - - if ( RefSeqFeature.isExon(ann) ) { - if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = 
coding exon sequence - else b.append(annUTR); // exon but not coding = UTR - } else { - if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron - else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... - } - b.append('\t'); - b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed -// while ( it.hasNext() ) { // -// t.getGeneName() -// } - return b.toString(); - } - - } - - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, boolean discard_event) { - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); - - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(makeFullRecord(normalCall)); - fullRecord.append(annotationString); - if ( discard_event && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); - try { - verboseWriter.write(fullRecord.toString()); - verboseWriter.write('\n'); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); - } - - } - - - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall, boolean discard_event) { - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - String annotationString = (refseqIterator == null ? 
"" : getAnnotationString(annotationList)); - - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(makeFullRecord(normalCall,tumorCall)); - - if ( normalCall.getVariant() == null && tumorCall.getVariant() == null ) { - // did not observe anything - if ( normalCall.getCoverage() >= minNormalCoverage && tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); - else { - if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); // no coverage in normal but nothing in tumor - else { - // no coverage in tumor; if we have no coverage in normal, it can be anything; if we do have coverage in normal, - // this still could be a somatic event. so either way it is 'unknown' - fullRecord.append("UNKNOWN"); - } - } - - } - - if ( normalCall.getVariant() == null && tumorCall.getVariant() != null ) { - // looks like somatic call - if ( normalCall.getCoverage() >= minNormalCoverage ) fullRecord.append("SOMATIC"); // we confirm there is nothing in normal - else { - // low coverage in normal - fullRecord.append("EVENT_T"); // no coverage in normal, no idea whether it is germline or somatic - } - } - - if ( normalCall.getVariant() != null && tumorCall.getVariant() == null ) { - // it's likely germline (with missing observation in tumor - maybe loh? - if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("GERMLINE_LOH"); // we confirm there is nothing in tumor - else { - // low coverage in tumor, maybe we missed the event - fullRecord.append("GERMLINE"); // no coverage in tumor but we already saw it in normal... - } - } - - if ( normalCall.getVariant() != null && tumorCall.getVariant() != null ) { - // events in both T/N, got to be germline! 
- fullRecord.append("GERMLINE"); - } - - - fullRecord.append('\t'); - fullRecord.append(annotationString); - - if ( discard_event && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); - - try { - verboseWriter.write(fullRecord.toString()); - verboseWriter.write('\n'); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); - } - } - - public void printVCFLine(VariantContextWriter vcf, IndelPrecall call, boolean discard_event) { - - long start = call.getPosition()-1; - // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. - // The suggestion is instead of putting the base before the indel, to put the base after the indel. - // For now, just don't print out that site. - if ( start == 0 ) - return; - - long stop = start; - - List alleles = new ArrayList(2); // actual observed (distinct!) alleles at the site - List homref_alleles = null; // when needed, will contain two identical copies of ref allele - needed to generate hom-ref genotype - - final byte referencePaddingBase = refBases[(int)start-1]; - - if ( call.getVariant() == null ) { - // we will need to create genotype with two (hom) ref alleles (below). - // we can not use 'alleles' list here, since that list is supposed to contain - // only *distinct* alleles observed at the site or VCFContext will frown upon us... - alleles.add( Allele.create(referencePaddingBase,true) ); - homref_alleles = new ArrayList(2); - homref_alleles.add( alleles.get(0)); - homref_alleles.add( alleles.get(0)); - } else { - // we always create alt allele when we observe anything but the ref, even if it is not a call! - // (Genotype will tell us whether it is an actual call or not!) 
- int event_length = call.getVariant().lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - fillAlleleList(alleles,call,referencePaddingBase); - stop += event_length; - } - - GenotypesContext genotypes = GenotypesContext.create(); - for ( String sample : normalSamples ) { - GenotypeBuilder gb = new GenotypeBuilder(sample); - - gb=call.addStatsAttributes(gb); - gb.alleles(! discard_event - ? alleles // we made a call - put actual het genotype here: - : homref_alleles); // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) - genotypes.add(gb.make()); - - } - Set filters = null; - if ( call.getVariant() != null && discard_event ) { - filters = new HashSet(); - filters.add("NoCall"); - } - VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles) - .genotypes(genotypes).filters(filters).make(); - vcf.add(vc); - } - - /** Fills l with appropriate alleles depending on whether call is insertion or deletion - * (l MUST have a variant or this method will crash). It is guaranteed that the *first* allele added - * to the list is ref, and the next one is alt. 
- * @param l - * @param call - */ - private void fillAlleleList(List l, IndelPrecall call, byte referencePaddingBase) { - int event_length = call.getVariant().lengthOnRef(); - if ( event_length == 0 ) { // insertion - - l.add( Allele.create(referencePaddingBase,true) ); - l.add( Allele.create((char)referencePaddingBase + new String(call.getVariant().getBases()), false )); - - } else { //deletion: - l.add( Allele.create((char)referencePaddingBase + new String(call.getVariant().getBases()), true )); - l.add( Allele.create(referencePaddingBase,false) ); - } - } - - public void printVCFLine(VariantContextWriter vcf, IndelPrecall nCall, IndelPrecall tCall, boolean discard_event) { - - long start = tCall.getPosition()-1; - long stop = start; - - // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. - // The suggestion is instead of putting the base before the indel, to put the base after the indel. - // For now, just don't print out that site. 
- if ( start == 0 ) - return; - - GenotypeBuilder nGB = new GenotypeBuilder(); - GenotypeBuilder tGB = new GenotypeBuilder(); - - nCall.addStatsAttributes(nGB); - tCall.addStatsAttributes(tGB); - - Map attrs = new HashMap(); - - boolean isSomatic = false; - if ( nCall.getVariant() == null && tCall.getVariant() != null ) { - isSomatic = true; - attrs.put(VCFConstants.SOMATIC_KEY,true); - } - List alleles = new ArrayList(2); // all alleles at the site - // List normal_alleles = null; // all alleles at the site - List homRefAlleles = null; - -// if ( nCall.getVariant() == null || tCall.getVariant() == null ) { - homRefAlleles = new ArrayList(2) ; // we need this for somatic calls (since normal is ref-ref), and also for no-calls -// } - boolean homRefT = ( tCall.getVariant() == null ); - boolean homRefN = ( nCall.getVariant() == null ); - final byte referencePaddingBase = refBases[(int)start-1]; - if ( tCall.getVariant() == null && nCall.getVariant() == null) { - // no indel at all ; create base-representation ref/ref alleles for genotype construction - alleles.add( Allele.create(referencePaddingBase,true) ); - } else { - // we got indel(s) - int event_length = 0; - if ( tCall.getVariant() != null ) { - // indel in tumor - event_length = tCall.getVariant().lengthOnRef(); - fillAlleleList(alleles, tCall, referencePaddingBase); - } else { - event_length = nCall.getVariant().lengthOnRef(); - fillAlleleList(alleles, nCall, referencePaddingBase); - } - if ( event_length > 0 ) stop += event_length; - } - homRefAlleles.add( alleles.get(0)); - homRefAlleles.add( alleles.get(0)); - - GenotypesContext genotypes = GenotypesContext.create(); - - for ( String sample : normalSamples ) { - genotypes.add(nGB.name(sample).alleles(homRefN ? homRefAlleles : alleles).make()); - } - - for ( String sample : tumorSamples ) { - genotypes.add(tGB.name(sample).alleles(homRefT ? 
homRefAlleles : alleles).make()); - } - - Set filters = null; - if ( tCall.getVariant() != null && discard_event ) { - filters = new HashSet(); - filters.add("NoCall"); - } - - VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles) - .genotypes(genotypes).filters(filters).attributes(attrs).make(); - vcf.add(vc); - } - - @Override - public void onTraversalDone(Integer result) { - if ( DEBUG ) { - System.out.println("DEBUG>> Emitting last window at "+normal_context.getStart()+"-"+normal_context.getStop()); - } - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); // emit everything we might have left - - if ( metricsWriter != null ) { - metricsWriter.println(String.format("Normal calls made %d", normalCallsMade)); - metricsWriter.println(String.format("Tumor calls made %d", tumorCallsMade)); - metricsWriter.close(); - } - - try { - if ( bedWriter != null ) bedWriter.close(); - if ( verboseWriter != null ) verboseWriter.close(); - } catch (IOException e) { - System.out.println("Failed to close output BED file gracefully, data may be lost"); - e.printStackTrace(); - } - super.onTraversalDone(result); - } - - @Override - public Integer reduce(Integer value, Integer sum) { - if ( value == -1 ) { - onTraversalDone(sum); - System.exit(1); - } - sum += value; - return sum; - } - - @Override - public Integer reduceInit() { - return 0; - } - - - static class IndelVariant { - public static enum Type { I, D}; - private String bases; - private Type type; - private ArrayList fromStartOffsets = null; - private ArrayList fromEndOffsets = null; - - private Set reads = new HashSet(); // keep track of reads that have this indel - private Set samples = new HashSet(); // which samples had the indel described by this object - - public IndelVariant(ExpandedSAMRecord read , Type type, String bases) { - this.type = type; - this.bases = bases.toUpperCase(); - addObservation(read); - fromStartOffsets = new ArrayList(); - 
fromEndOffsets = new ArrayList(); - } - - /** Adds another observation for the current indel. It is assumed that the read being registered - * does contain the observation, no checks are performed. Read's sample is added to the list of samples - * this indel was observed in as well. - * @param read - */ - public void addObservation(ExpandedSAMRecord read) { - if ( reads.contains(read) ) { - //TODO fix CleanedReadInjector and reinstate exception here: duplicate records may signal a problem with the bam - // seeing the same read again can mean only one thing: the input bam file is corrupted and contains - // duplicate records. We KNOW that this may happen for the time being due to bug in CleanedReadInjector - // so this is a short-term patch: don't cry, but just ignore the duplicate record - - //throw new StingException("Attempting to add indel observation that was already registered"); - return; - } - reads.add(read); - String sample = null; - if ( read.getSAMRecord().getReadGroup() != null ) sample = read.getSAMRecord().getReadGroup().getSample(); - if ( sample != null ) samples.add(sample); - } - - - /** Returns length of the event on the reference (number of deleted bases - * for deletions, -1 for insertions. 
- * @return - */ - public int lengthOnRef() { - if ( type == Type.D ) return bases.length(); - else return 0; - } - - - public void addSample(String sample) { - if ( sample != null ) - samples.add(sample); - } - - public void addReadPositions(int fromStart, int fromEnd) { - fromStartOffsets.add(fromStart); - fromEndOffsets.add(fromEnd); - } - - public List getOffsetsFromStart() { return fromStartOffsets ; } - public List getOffsetsFromEnd() { return fromEndOffsets; } - - public String getSamples() { - StringBuffer sb = new StringBuffer(); - Iterator i = samples.iterator(); - while ( i.hasNext() ) { - sb.append(i.next()); - if ( i.hasNext() ) - sb.append(","); - } - return sb.toString(); - } - - public Set getReadSet() { return reads; } - - public int getCount() { return reads.size(); } - - public String getBases() { return bases; } - - public Type getType() { return type; } - - @Override - public boolean equals(Object o) { - if ( ! ( o instanceof IndelVariant ) ) return false; - IndelVariant that = (IndelVariant)o; - return ( this.type == that.type && this.bases.equals(that.bases) ); - } - - public boolean equals(Type type, String bases) { - return ( this.type == type && this.bases.equals(bases.toUpperCase()) ); - } - } - - /** - * Utility class that encapsulates the logic related to collecting all the stats and counts required to - * make (or discard) a call, as well as the calling heuristics that uses those data. 
- */ - class IndelPrecall { -// private boolean DEBUG = false; - private int NQS_MISMATCH_CUTOFF = 1000000; - private double AV_MISMATCHES_PER_READ = 1.5; - - private int nqs = 0; - private IndelVariant consensus_indel = null; // indel we are going to call - private long pos = -1 ; // position on the ref - private int total_coverage = 0; // total number of reads overlapping with the event - private int consensus_indel_count = 0; // number of reads, in which consensus indel was observed - private int all_indel_count = 0 ; // number of reads, in which any indel was observed at current position - - private int total_mismatches_in_nqs_window = 0; // total number of mismatches in the nqs window around the indel - private int total_bases_in_nqs_window = 0; // total number of bases in the nqs window (some reads may not fully span the window so it's not coverage*nqs_size) - private int total_base_qual_in_nqs_window = 0; // sum of qualitites of all the bases in the nqs window - private int total_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of all mismatching bases in the nqs window - - private int indel_read_mismatches_in_nqs_window = 0; // mismatches inside the nqs window in indel-containing reads only - private int indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from indel-containing reads only - private int indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from indel-containing reads only - private int indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from indel-containing reads only - - - private int consensus_indel_read_mismatches_in_nqs_window = 0; // mismatches within the nqs window from consensus indel reads only - private int consensus_indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from consensus indel-containing reads only - private int consensus_indel_read_base_qual_in_nqs_window = 0; // sum of 
qualitites of bases in nqs window from consensus indel-containing reads only - private int consensus_indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from consensus indel-containing reads only - - - private double consensus_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain consensus indel - private double all_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain any indel at given position - private double all_read_total_mm = 0.0; // sum of all mismatches in all reads - - private double consensus_indel_read_total_mapq = 0.0; // sum of mapping qualitites of all reads with consensus indel - private double all_indel_read_total_mapq = 0.0 ; // sum of mapping qualitites of all reads with (any) indel at current position - private double all_read_total_mapq = 0.0; // sum of all mapping qualities of all reads - - private PrimitivePair.Int consensus_indel_read_orientation_cnt = new PrimitivePair.Int(); - private PrimitivePair.Int all_indel_read_orientation_cnt = new PrimitivePair.Int(); - private PrimitivePair.Int all_read_orientation_cnt = new PrimitivePair.Int(); - - private int from_start_median = 0; - private int from_start_mad = 0; - private int from_end_median = 0; - private int from_end_mad = 0; - - /** Makes an empty call (no-call) with all stats set to 0 - * - * @param position - */ - public IndelPrecall(long position) { - this.pos = position; - } - - public IndelPrecall(WindowContext context, long position, int nqs_width) { - this.pos = position; - this.nqs = nqs_width; - total_coverage = context.coverageAt(pos,true); - List variants = context.indelsAt(pos); - findConsensus(variants); - - // pos is the first base after the event: first deleted base or first base after insertion. 
- // hence, [pos-nqs, pos+nqs-1] (inclusive) is the window with nqs bases on each side of a no-event or an insertion - // and [pos-nqs, pos+Ndeleted+nqs-1] is the window with nqs bases on each side of a deletion. - // we initialize the nqs window for no-event/insertion case - long left = Math.max( pos-nqs, context.getStart() ); - long right = Math.min(pos+nqs-1, context.getStop()); -//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size()); - Iterator read_iter = context.getReads().iterator(); - - - while ( read_iter.hasNext() ) { - ExpandedSAMRecord rec = read_iter.next(); - SAMRecord read = rec.getSAMRecord(); - byte[] flags = rec.getExpandedMMFlags(); - byte[] quals = rec.getExpandedQuals(); - int mm = rec.getMMCount(); - - - if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue; - - long local_right = right; // end of nqs window for this particular read. May need to be advanced further right - // if read has a deletion. 
The gap in the middle of nqs window will be skipped - // automatically since flags/quals are set to -1 there - - boolean read_has_a_variant = false; - boolean read_has_consensus = ( consensus_indel!= null && consensus_indel.getReadSet().contains(rec) ); - for ( IndelVariant v : variants ) { - if ( v.getReadSet().contains(rec) ) { - read_has_a_variant = true; - local_right += v.lengthOnRef(); - break; - } - } - - if ( read_has_consensus ) { - consensus_indel_read_total_mm += mm; - consensus_indel_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++; - else consensus_indel_read_orientation_cnt.first++; - } - if ( read_has_a_variant ) { - all_indel_read_total_mm += mm; - all_indel_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++; - else all_indel_read_orientation_cnt.first++; - } - - all_read_total_mm+= mm; - all_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++; - else all_read_orientation_cnt.first++; - - for ( int pos_in_flags = Math.max((int)(left - read.getAlignmentStart()),0); - pos_in_flags <= Math.min((int)local_right-read.getAlignmentStart(),flags.length - 1); - pos_in_flags++) { - - if ( flags[pos_in_flags] == -1 ) continue; // gap (deletion), skip it; we count only bases aligned to the ref - total_bases_in_nqs_window++; - if ( read_has_consensus ) consensus_indel_read_bases_in_nqs_window++; - if ( read_has_a_variant ) indel_read_bases_in_nqs_window++; - - if ( quals[pos_in_flags] != -1 ) { - - total_base_qual_in_nqs_window += quals[pos_in_flags]; - if ( read_has_a_variant ) indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; - if ( read_has_consensus ) consensus_indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; - } - - if ( flags[pos_in_flags] == 1 ) { // it's a mismatch - total_mismatches_in_nqs_window++; - 
total_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - - if ( read_has_consensus ) { - consensus_indel_read_mismatches_in_nqs_window++; - consensus_indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - } - - if ( read_has_a_variant ) { - indel_read_mismatches_in_nqs_window++; - indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - } - } - } -// if ( pos == 3534096 ) { -// System.out.println(read.getReadName()); -// System.out.println(" cons nqs bases="+consensus_indel_read_bases_in_nqs_window); -// System.out.println(" qual sum="+consensus_indel_read_base_qual_in_nqs_window); -// } - - } - - // compute median/mad for offsets from the read starts/ends - if ( consensus_indel != null ) { - from_start_median = median(consensus_indel.getOffsetsFromStart()) ; - from_start_mad = mad(consensus_indel.getOffsetsFromStart(),from_start_median); - from_end_median = median(consensus_indel.getOffsetsFromEnd()) ; - from_end_mad = mad(consensus_indel.getOffsetsFromEnd(),from_end_median); - } - } - - /** As a side effect will sort l! - * - * @param l - * @return - */ - private int median(List l) { - Collections.sort(l); - int k = l.size()/2; - return ( l.size() % 2 == 0 ? - (l.get(k-1).intValue()+l.get(k).intValue())/2 : - l.get(k).intValue()); - } - - private int median(int[] l) { - Arrays.sort(l); - int k = l.length/2; - return ( l.length % 2 == 0 ? 
- (l[k-1]+l[k])/2 : - l[k]); - } - - private int mad(List l, int med) { - int [] diff = new int[l.size()]; - for ( int i = 0; i < l.size(); i++ ) { - diff[i] = Math.abs(l.get(i).intValue() - med); - } - return median(diff); - } - - public long getPosition() { return pos; } - - public boolean hasObservation() { return consensus_indel != null; } - - public int getCoverage() { return total_coverage; } - - public double getTotalMismatches() { return all_read_total_mm; } - public double getConsensusMismatches() { return consensus_indel_read_total_mm; } - public double getAllVariantMismatches() { return all_indel_read_total_mm; } - - /** Returns average number of mismatches per consensus indel-containing read */ - public double getAvConsensusMismatches() { - return ( consensus_indel_count != 0 ? consensus_indel_read_total_mm/consensus_indel_count : 0.0 ); - } - - /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants) */ - public double getAvRefMismatches() { - int coverage_ref = total_coverage-all_indel_count; - return ( coverage_ref != 0 ? (all_read_total_mm - all_indel_read_total_mm )/coverage_ref : 0.0 ); - } - - public PrimitivePair.Int getConsensusStrandCounts() { - return consensus_indel_read_orientation_cnt; - } - - public PrimitivePair.Int getRefStrandCounts() { - return new PrimitivePair.Int(all_read_orientation_cnt.first-all_indel_read_orientation_cnt.first, - all_read_orientation_cnt.second - all_indel_read_orientation_cnt.second); - } - - /** Returns a sum of mapping qualities of all reads spanning the event. */ - public double getTotalMapq() { return all_read_total_mapq; } - - /** Returns a sum of mapping qualities of all reads, in which the consensus variant is observed. */ - public double getConsensusMapq() { return consensus_indel_read_total_mapq; } - - /** Returns a sum of mapping qualities of all reads, in which any variant is observed at the current event site. 
*/ - public double getAllVariantMapq() { return all_indel_read_total_mapq; } - - /** Returns average mapping quality per consensus indel-containing read. */ - public double getAvConsensusMapq() { - return ( consensus_indel_count != 0 ? consensus_indel_read_total_mapq/consensus_indel_count : 0.0 ); - } - - /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants). */ - public double getAvRefMapq() { - int coverage_ref = total_coverage-all_indel_count; - return ( coverage_ref != 0 ? (all_read_total_mapq - all_indel_read_total_mapq )/coverage_ref : 0.0 ); - } - - /** Returns fraction of bases in NQS window around the indel that are mismatches, across all reads, - * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around - * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} - * */ - public double getNQSConsensusMMRate() { - if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; - return ((double)consensus_indel_read_mismatches_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; - } - - /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads - * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref - * reads is defined around the event start position, NOT around the actual consensus indel. - * */ - public double getNQSRefMMRate() { - int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; - if ( num_ref_bases == 0 ) return 0; - return ((double)(total_mismatches_in_nqs_window - indel_read_mismatches_in_nqs_window))/num_ref_bases; - } - - /** Returns average base quality in NQS window around the indel, across all reads, - * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around - * the indel itself (e.g. 
for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} - * */ - public double getNQSConsensusAvQual() { - if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; - return ((double)consensus_indel_read_base_qual_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; - } - - /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads - * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref - * reads is defined around the event start position, NOT around the actual consensus indel. - * */ - public double getNQSRefAvQual() { - int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; - if ( num_ref_bases == 0 ) return 0; - return ((double)(total_base_qual_in_nqs_window - indel_read_base_qual_in_nqs_window))/num_ref_bases; - } - - public int getTotalNQSMismatches() { return total_mismatches_in_nqs_window; } - - public int getAllVariantCount() { return all_indel_count; } - public int getConsensusVariantCount() { return consensus_indel_count; } - -// public boolean failsNQSMismatch() { -// //TODO wrong fraction: mismatches are counted only in indel-containing reads, but total_coverage is used! 
-// return ( indel_read_mismatches_in_nqs_window > NQS_MISMATCH_CUTOFF ) || -// ( indel_read_mismatches_in_nqs_window > total_coverage * AV_MISMATCHES_PER_READ ); -// } - - public IndelVariant getVariant() { return consensus_indel; } - - public void fillContext(JexlContext context,String[] cassette) { - context.set(cassette[C_INDEL_F],((double)consensus_indel_count)/total_coverage); - context.set(cassette[C_INDEL_CF],((double)consensus_indel_count/all_indel_count)); - context.set(cassette[C_COV],total_coverage); - context.set(cassette[C_CONS_CNT],consensus_indel_count); - } -/* - public boolean isCall() { - boolean ret = ( consensus_indel_count >= minIndelCount && - (double)consensus_indel_count > minFraction * total_coverage && - (double) consensus_indel_count > minConsensusFraction*all_indel_count && total_coverage >= minCoverage); - if ( DEBUG && ! ret ) System.out.println("DEBUG>> NOT a call: count="+consensus_indel_count+ - " total_count="+all_indel_count+" cov="+total_coverage+ - " minConsensusF="+((double)consensus_indel_count)/all_indel_count+ - " minF="+((double)consensus_indel_count)/total_coverage); - - return ret; -// return true; - } -*/ - /** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed - * variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus) - * @param variants - * @return - */ - private void findConsensus(List variants) { - for ( IndelVariant var : variants ) { - if ( DEBUG ) System.out.println("DEBUG>> Variant "+var.getBases()+" (cnt="+var.getCount()+")"); - int cnt = var.getCount(); - all_indel_count +=cnt; - if ( cnt > consensus_indel_count ) { - consensus_indel = var; - consensus_indel_count = cnt; - } - } - if ( DEBUG && consensus_indel != null ) System.out.println("DEBUG>> Returning: "+consensus_indel.getBases()+ - " (cnt="+consensus_indel.getCount()+") with total count of "+all_indel_count); - } - - - - public void 
printBedLine(Writer bed) { - int event_length; - if ( consensus_indel == null ) event_length = 0; - else { - event_length = consensus_indel.lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - } - - StringBuffer message = new StringBuffer(); - message.append(refName+"\t"+(pos-1)+"\t"); - message.append((pos-1+event_length)+"\t"); - if ( consensus_indel != null ) { - message.append((event_length>0? "-":"+")+consensus_indel.getBases()); - } else { - message.append('.'); - } - message.append(":"+all_indel_count+"/"+total_coverage); - try { - bed.write(message.toString()+"\n"); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(bedOutput, "Error encountered while writing into output BED file", e); - } - } - - public String makeEventString() { - int event_length; - if ( consensus_indel == null ) event_length = 0; - else { - event_length = consensus_indel.lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - } - StringBuffer message = new StringBuffer(); - message.append(refName); - message.append('\t'); - message.append(pos-1); - message.append('\t'); - message.append(pos-1+event_length); - message.append('\t'); - if ( consensus_indel != null ) { - message.append((event_length>0?'-':'+')); - message.append(consensus_indel.getBases()); - } else { - message.append('.'); - } - return message.toString(); - } - - public String makeStatsString(String prefix) { - StringBuilder message = new StringBuilder(); - message.append(prefix+"OBS_COUNTS[C/A/T]:"+getConsensusVariantCount()+"/"+getAllVariantCount()+"/"+getCoverage()); - message.append('\t'); - message.append(prefix+"AV_MM[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMismatches(), - getAvRefMismatches())); - message.append('\t'); - message.append(prefix+"AV_MAPQ[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMapq(), - getAvRefMapq())); - message.append('\t'); - message.append(prefix+"NQS_MM_RATE[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusMMRate(),getNQSRefMMRate())); - 
message.append('\t'); - message.append(prefix+"NQS_AV_QUAL[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusAvQual(),getNQSRefAvQual())); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - message.append('\t'); - message.append(prefix+"STRAND_COUNTS[C/C/R/R]:"+strand_cons.first+"/"+strand_cons.second+"/"+strand_ref.first+"/"+strand_ref.second); - - message.append('\t'); - message.append(prefix+"OFFSET_RSTART:"+from_start_median+"/"+from_start_mad); - message.append('\t'); - message.append(prefix+"OFFSET_REND:"+from_end_median+"/"+from_end_mad); - - return message.toString(); - - } - - /** - * Places alignment statistics into attribute map and returns the map. If attr parameter is null, - * a new map is allocated, filled and returned. If attr is not null, new attributes are added to that - * preexisting map, and the same instance of the (updated) map is returned. - * - * @param attr - * @return - */ - public Map makeStatsAttributes(Map attr) { - if ( attr == null ) attr = new HashMap(); - - VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),attr); - - VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),attr); - - VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),attr); - - VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),attr); - - VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),attr); - - VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,attr); - - VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,attr); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - - VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,attr); - return attr; - } - - - /** - * Adds alignment statistics directly into the 
genotype builder object. - * - * @param gb - * @return - */ - public GenotypeBuilder addStatsAttributes(GenotypeBuilder gb) { - if ( gb == null ) gb = new GenotypeBuilder(); - - gb = VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),gb); - - gb = VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),gb); - - gb = VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),gb); - - gb = VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),gb); - - gb = VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),gb); - - gb = VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,gb); - - gb = VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,gb); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - - gb = VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,gb); - return gb; - } - - } - - interface IndelListener { - public void addObservation(int pos, IndelVariant.Type t, String bases, int fromStart, int fromEnd, ExpandedSAMRecord r); - } - - class WindowContext implements IndelListener { - private Set reads; - private int start=0; // where the window starts on the ref, 1-based - private CircularArray< List< IndelVariant > > indels; - - private List emptyIndelList = new ArrayList(); - - - public WindowContext(int start, int length) { - this.start = start; - indels = new CircularArray< List >(length); -// reads = new LinkedList(); - reads = new HashSet(); - } - - /** Returns 1-based reference start position of the interval this object keeps context for. - * - * @return - */ - public int getStart() { return start; } - - /** Returns 1-based reference stop position (inclusive) of the interval this object keeps context for. 
- * - * @return - */ - public int getStop() { return start + indels.length() - 1; } - - /** Resets reference start position to 0 and clears the context. - * - */ - public void clear() { - start = 0; - reads.clear(); - indels.clear(); - } - - /** - * Returns true if any indel observations are present in the specified interval - * [begin,end] (1-based, inclusive). Interval can be partially of fully outside of the - * current context window: positions outside of the window will be ignored. - * @param begin - * @param end - */ - public boolean hasIndelsInInterval(long begin, long end) { - for ( long k = Math.max(start,begin); k < Math.min(getStop(),end); k++ ) { - if ( indelsAt(k) != emptyIndelList ) return true; - } - return false; - } - - public Set getReads() { return reads; } - - /** Returns the number of reads spanning over the specified reference position - * (regardless of whether they have a base or indel at that specific location). - * The second argument controls whether to count with indels in mind (this is relevant for insertions only, - * deletions do not require any special treatment since they occupy non-zero length on the ref and since - * alignment can not start or end with a deletion). For insertions, note that, internally, we assign insertions - * to the reference position right after the actual event, and we count all events assigned to a given position. - * This count (reads with indels) should be contrasted to reads without indels, or more rigorously, reads - * that support the ref rather than the indel. Few special cases may occur here: - * 1) an alignment that ends (as per getAlignmentEnd()) right before the current position but has I as its - * last element: we have to count that read into the "coverage" at the current position for the purposes of indel - * assessment, as the indel in that read will be counted at the current position, so the total coverage - * should be consistent with that. 
- */ - /* NOT IMPLEMENTED: 2) alsignments that start exactly at the current position do not count for the purpose of insertion - * assessment since they do not contribute any evidence to either Ref or Alt=insertion hypothesis, unless - * the alignment starts with I (so that we do have evidence for an indel assigned to the current position and - * read should be counted). For deletions, reads starting at the current position should always be counted (as they - * show no deletion=ref). - * @param refPos position on the reference; must be within the bounds of the window - */ - public int coverageAt(final long refPos, boolean countForIndels) { - int cov = 0; - for ( ExpandedSAMRecord read : reads ) { - if ( read.getSAMRecord().getAlignmentStart() > refPos || read.getSAMRecord().getAlignmentEnd() < refPos ) { - if ( countForIndels && read.getSAMRecord().getAlignmentEnd() == refPos - 1) { - Cigar c = read.getSAMRecord().getCigar(); - if ( c.getCigarElement(c.numCigarElements()-1).getOperator() == CigarOperator.I ) cov++; - } - continue; - } - cov++; - } - return cov; - } - - - /** Shifts current window to the right along the reference contig by the specified number of bases. - * The context will be updated accordingly (indels and reads that go out of scope will be dropped). 
- * @param offset - */ - public void shift(int offset) { - start += offset; - - indels.shiftData(offset); - if ( indels.get(0) != null && indels.get(0).size() != 0 ) { - IndelVariant indel = indels.get(0).get(0); - - System.out.println("WARNING: Indel(s) at first position in the window ("+refName+":"+start+"): currently not supported: "+ - (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; read: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()+"; site ignored"); - indels.get(0).clear(); -// throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+ -// (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()); - } - - Iterator read_iter = reads.iterator(); - - while ( read_iter.hasNext() ) { - ExpandedSAMRecord r = read_iter.next(); - if ( r.getSAMRecord().getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope - read_iter.remove(); - } - } - } - - public void add(SAMRecord read, byte [] ref) { - - if ( read.getAlignmentStart() < start ) return; // silently ignore reads starting before the window start - - ExpandedSAMRecord er = new ExpandedSAMRecord(read,ref,read.getAlignmentStart()-start,this); - //TODO duplicate records may actually indicate a problem with input bam file; throw an exception when the bug in CleanedReadInjector is fixed - if ( reads.contains(er)) return; // ignore duplicate records - reads.add(er); - } - - public void addObservation(int pos, IndelVariant.Type type, String bases, int fromStart, int fromEnd, ExpandedSAMRecord rec) { - List indelsAtSite; - try { - indelsAtSite = indels.get(pos); - } catch (IndexOutOfBoundsException e) { - SAMRecord r = rec.getSAMRecord(); - System.out.println("Failed to add indel observation, probably out of coverage window bounds (trailing indel?):\nRead "+ - r.getReadName()+": "+ - "read 
length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+ - r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+ - "; window end="+getStop()); - throw e; - } - - if ( indelsAtSite == null ) { - indelsAtSite = new ArrayList(); - indels.set(pos, indelsAtSite); - } - - IndelVariant indel = null; - for ( IndelVariant v : indelsAtSite ) { - if ( ! v.equals(type, bases) ) continue; - - indel = v; - indel.addObservation(rec); - break; - } - - if ( indel == null ) { // not found: - indel = new IndelVariant(rec, type, bases); - indelsAtSite.add(indel); - } - indel.addReadPositions(fromStart,fromEnd); - } - - public List indelsAt( final long refPos ) { - List l = indels.get((int)( refPos - start )); - if ( l == null ) return emptyIndelList; - else return l; - } - - - } - - - class ExpandedSAMRecord { - private SAMRecord read; - private byte[] mismatch_flags; - private byte[] expanded_quals; - private int mms; - - public ExpandedSAMRecord(SAMRecord r, byte [] ref, long offset, IndelListener l) { - - read = r; - final long rStart = read.getAlignmentStart(); - final long rStop = read.getAlignmentEnd(); - final byte[] readBases = read.getReadString().toUpperCase().getBytes(); - - ref = new String(ref).toUpperCase().getBytes(); - - mismatch_flags = new byte[(int)(rStop-rStart+1)]; - expanded_quals = new byte[(int)(rStop-rStart+1)]; - - // now let's extract indels: - - Cigar c = read.getCigar(); - final int nCigarElems = c.numCigarElements(); - - - int readLength = 0; // length of the aligned part of the read NOT counting clipped bases - for ( CigarElement cel : c.getCigarElements() ) { - - switch(cel.getOperator()) { - case H: - case S: - case D: - case N: - case P: - break; // do not count gaps or clipped bases - case I: - case M: - case EQ: - case X: - readLength += cel.getLength(); - break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string: 
"+cel.getOperator()); - } - } - - int fromStart = 0; - int posOnRead = 0; - int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read: - // its start on the actual full reference contig is r.getAlignmentStart() - for ( int i = 0 ; i < nCigarElems ; i++ ) { - - final CigarElement ce = c.getCigarElement(i); - IndelVariant.Type type = null; - String indel_bases = null; - int eventPosition = posOnRef; - - switch(ce.getOperator()) { - case H: break; // hard clipped reads do not have clipped indel_bases in their sequence, so we just ignore the H element... - case I: - type = IndelVariant.Type.I; - indel_bases = read.getReadString().substring(posOnRead,posOnRead+ce.getLength()); - // will increment position on the read below, there's no 'break' statement yet... - case S: - // here we also skip soft-clipped indel_bases on the read; according to SAM format specification, - // alignment start position on the reference points to where the actually aligned - // (not clipped) indel_bases go, so we do not need to increment reference position here - posOnRead += ce.getLength(); - break; - case D: - type = IndelVariant.Type.D; - indel_bases = new String( ref, posOnRef, ce.getLength() ); - for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) mismatch_flags[posOnRef] = expanded_quals[posOnRef] = -1; - - break; - case M: - case EQ: - case X: - for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { - if ( readBases[posOnRead] != ref[posOnRef] ) { // mismatch! - mms++; - mismatch_flags[posOnRef] = 1; - } - expanded_quals[posOnRef] = read.getBaseQualities()[posOnRead]; - } - fromStart += ce.getLength(); - break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator()); - } - - if ( type == null ) continue; // element was not an indel, go grab next element... - - // we got an indel if we are here... 
- if ( i == 0 ) logger.debug("Indel at the start of the read "+read.getReadName()); - if ( i == nCigarElems - 1) logger.debug("Indel at the end of the read "+read.getReadName()); - - // note that here we will be assigning indels to the first deleted base or to the first - // base after insertion, not to the last base before the event! - int fromEnd = readLength - fromStart; - if ( type == IndelVariant.Type.I ) fromEnd -= ce.getLength(); - - l.addObservation((int)(offset+eventPosition), type, indel_bases, fromStart, fromEnd, this); - - if ( type == IndelVariant.Type.I ) fromStart += ce.getLength(); - - } - } - - public SAMRecord getSAMRecord() { return read; } - - public byte [] getExpandedMMFlags() { return mismatch_flags; } - - public byte [] getExpandedQuals() { return expanded_quals; } - - public int getMMCount() { return mms; } - - public boolean equals(Object o) { - if ( this == o ) return true; - if ( read == null ) return false; - if ( o instanceof SAMRecord ) return read.equals(o); - if ( o instanceof ExpandedSAMRecord ) return read.equals(((ExpandedSAMRecord)o).read); - return false; - } - - - } - -} - - -class VCFIndelAttributes { - public static String ALLELIC_DEPTH_KEY = VCFConstants.GENOTYPE_ALLELE_DEPTHS; - public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY; - - public static String MAPQ_KEY = "MQS"; - - public static String MM_KEY = "MM"; - - public static String NQS_MMRATE_KEY = "NQSMM"; - - public static String NQS_AVQ_KEY = "NQSBQ"; - - public static String STRAND_COUNT_KEY = "SC"; - public static String RSTART_OFFSET_KEY = "RStart"; - public static String REND_OFFSET_KEY = "REnd"; - - public static Set getAttributeHeaderLines() { - Set lines = new HashSet(); - - lines.add(new VCFFormatHeaderLine(ALLELIC_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "# of reads supporting consensus reference/indel at the site")); - lines.add(new VCFFormatHeaderLine(DEPTH_TOTAL_KEY, 1, VCFHeaderLineType.Integer, "Total coverage at the site")); - - 
lines.add(new VCFFormatHeaderLine(MAPQ_KEY, 2, VCFHeaderLineType.Float, "Average mapping qualities of ref-/consensus indel-supporting reads")); - - lines.add(new VCFFormatHeaderLine(MM_KEY, 2, VCFHeaderLineType.Float, "Average # of mismatches per ref-/consensus indel-supporting read")); - - lines.add(new VCFFormatHeaderLine(NQS_MMRATE_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads")); - - lines.add(new VCFFormatHeaderLine(NQS_AVQ_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: average quality of bases in ref-/consensus indel-supporting reads")); - - lines.add(new VCFFormatHeaderLine(STRAND_COUNT_KEY, 4, VCFHeaderLineType.Integer, "Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)")); - - lines.add(new VCFFormatHeaderLine(RSTART_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the starts of the reads")); - lines.add(new VCFFormatHeaderLine(REND_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the ends of the reads")); - - return lines; - } - - public static Map recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, Map attrs) { - attrs.put(STRAND_COUNT_KEY, new Integer[] {cnt_cons_fwd, cnt_cons_rev, cnt_ref_fwd, cnt_ref_rev} ); - return attrs; - } - - public static GenotypeBuilder recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, - GenotypeBuilder gb) { - return gb.attribute(STRAND_COUNT_KEY, new Integer[] {cnt_ref_fwd, cnt_ref_rev,cnt_cons_fwd, cnt_cons_rev } ); - } - - public static Map recordDepth(int cnt_cons, int cnt_indel, int cnt_total, Map attrs) { - attrs.put(ALLELIC_DEPTH_KEY, new Integer[] {cnt_total-cnt_indel, cnt_cons} ); - attrs.put(DEPTH_TOTAL_KEY, cnt_total); - return attrs; - } - - public static GenotypeBuilder recordDepth(int cnt_cons, int cnt_indel, int cnt_total, 
GenotypeBuilder gb) { - return gb.AD(new int[] {cnt_total-cnt_indel,cnt_cons} ).DP(cnt_total); - } - - public static Map recordAvMapQ(double cons, double ref, Map attrs) { - attrs.put(MAPQ_KEY, new Float[] {(float)ref, (float)cons} ); - return attrs; - } - - public static GenotypeBuilder recordAvMapQ(double cons, double ref, GenotypeBuilder gb) { - return gb.attribute(MAPQ_KEY,new float[] {(float)ref, (float)cons} ); - } - - public static Map recordAvMM(double cons, double ref, Map attrs) { - attrs.put(MM_KEY, new Float[] {(float)ref, (float)cons} ); - return attrs; - } - - public static GenotypeBuilder recordAvMM(double cons, double ref, GenotypeBuilder gb) { - return gb.attribute(MM_KEY, new float[] {(float)ref, (float)cons} ); - } - - public static Map recordNQSMMRate(double cons, double ref, Map attrs) { - attrs.put(NQS_MMRATE_KEY, new Float[] {(float)ref, (float)cons} ); - return attrs; - } - - public static GenotypeBuilder recordNQSMMRate(double cons, double ref, GenotypeBuilder gb) { - return gb.attribute(NQS_MMRATE_KEY, new float[] {(float)ref, (float)cons} ); - } - - public static Map recordNQSAvQ(double cons, double ref, Map attrs) { - attrs.put(NQS_AVQ_KEY, new float[] {(float)ref, (float)cons} ); - return attrs; - } - - public static GenotypeBuilder recordNQSAvQ(double cons, double ref, GenotypeBuilder gb) { - return gb.attribute(NQS_AVQ_KEY, new float[] {(float)ref, (float)cons} ); - } - - public static Map recordOffsetFromStart(int median, int mad, Map attrs) { - attrs.put(RSTART_OFFSET_KEY, new Integer[] {median, mad} ); - return attrs; - } - - public static GenotypeBuilder recordOffsetFromStart(int median, int mad, GenotypeBuilder gb) { - return gb.attribute(RSTART_OFFSET_KEY, new int[] {median, mad} ); - } - - public static Map recordOffsetFromEnd(int median, int mad, Map attrs) { - attrs.put(REND_OFFSET_KEY, new Integer[] {median, mad} ); - return attrs; - } - - public static GenotypeBuilder recordOffsetFromEnd(int median, int mad, GenotypeBuilder 
gb) { - return gb.attribute(REND_OFFSET_KEY, new int[] {median, mad} ); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 98fe6636c..295e6f3cd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -80,6 +80,9 @@ public class VariantsToBinaryPed extends RodWalker { @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") boolean majorAlleleFirst = false; + @Argument(fullName="checkAlternateAlleles",required=false,doc="Checks that alternate alleles actually appear in samples, erroring out if they do not") + boolean checkAlternateAlleles = false; + enum OutputMode { INDIVIDUAL_MAJOR,SNP_MAJOR } private static double APPROX_CM_PER_BP = 1000000.0/750000.0; @@ -473,7 +476,8 @@ public class VariantsToBinaryPed extends RodWalker { System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength); final Allele observedRefAllele = Allele.create(observedRefBases); vc.validateReferenceBases(reportedRefAllele, observedRefAllele); - vc.validateAlternateAlleles(); + if ( checkAlternateAlleles ) + vc.validateAlternateAlleles(); } private String getReferenceAllele(VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java deleted file mode 100755 index 0b47093fa..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in 
the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -/** - * Represents a single amino acid. - */ -public class AminoAcid { - private String name; - private String threeLetterCode; - private String letter; - - - /** - * Constructor. - * - * @param letter The 1 letter code. (eg. I). This is '*' for the stop codon. - * @param name The full name of the AA (eg. Isoleucine). - * @param code The 3 letter code. (eg. Ile). - */ - public AminoAcid( String letter, String name, String code) { - this.name = name; - this.threeLetterCode = code; - this.letter = letter; - } - - /** Equality based on the amino acid code. */ - public boolean equals(Object o) { - if (this == o) { return true; } - if (o == null || !(o instanceof AminoAcid)) { return false; } - - final AminoAcid aminoAcid = (AminoAcid) o; - return !(getCode() != null ? !getCode().equals(aminoAcid.getCode()) : aminoAcid.getCode() != null); - } - - /** Hashes the three letter code. */ - public int hashCode() { - return (getCode() != null ? 
getCode().hashCode() : 0); - } - - /** - * Returns the full name of this AA. - */ - public String getName() { return name; } - - /** - * Returns the 1 letter code for this AA. - */ - public String getLetter() { return letter; } - - /** - * Returns the 3 letter code. - */ - public String getCode() { return threeLetterCode; } - - - /** Returns true if the amino acid is really just a stop codon. */ - public boolean isStop() { - return "*".equals(getLetter()); - } - - /** Returns true if the amino acid is really just a stop codon. */ - public boolean isUnknown() { - return "X".equals(getLetter()); - } - - public String toString() { - return name; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java deleted file mode 100755 index 1ae28ffb3..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -import java.util.HashMap; - -/** - * A simple {codon -> amino acid name} lookup table. - * Handles differences between mitochondrial and nuclear genes. - */ -public class AminoAcidTable { - - - protected static final AminoAcid UNKNOWN = new AminoAcid("X" , "Unknown", "Unk"); - protected static final AminoAcid ISOLEUCINE = new AminoAcid("I" , "Isoleucine", "Ile"); - protected static final AminoAcid LEUCINE = new AminoAcid("L" , "Leucine", "Leu"); - protected static final AminoAcid VALINE = new AminoAcid("V" , "Valine", "Val"); - protected static final AminoAcid PHENYLALANINE = new AminoAcid("F" , "Phenylalanine", "Phe"); - protected static final AminoAcid METHIONINE = new AminoAcid("M" , "Methionine", "Met"); - protected static final AminoAcid CYSTEINE = new AminoAcid("C" , "Cysteine", "Cys"); - protected static final AminoAcid ALANINE = new AminoAcid("A" , "Alanine", "Ala"); - protected static final AminoAcid STOP_CODON = new AminoAcid("*" , "Stop Codon", "Stop"); - protected static final AminoAcid GLYCINE = new AminoAcid("G" , "Glycine", "Gly"); - protected static final AminoAcid PROLINE = new AminoAcid("P" , "Proline", "Pro"); - protected static final AminoAcid THEONINE = new AminoAcid("T" , "Threonine", "Thr"); - protected static final AminoAcid SERINE = new AminoAcid("S" , "Serine", "Ser"); - protected static final AminoAcid TYROSINE = new AminoAcid("Y" , "Tyrosine", "Tyr"); - protected static final AminoAcid TRYPTOPHAN = new AminoAcid("W" , "Tryptophan", "Trp"); - protected static final AminoAcid GLUTAMINE = new AminoAcid("Q" , "Glutamine", "Gln"); - protected static final AminoAcid ASPARAGINE = new AminoAcid("N" , "Asparagine", "Asn"); - 
protected static final AminoAcid HISTIDINE = new AminoAcid("H" , "Histidine", "His"); - protected static final AminoAcid GLUTAMIC_ACID = new AminoAcid("E" , "Glutamic acid", "Glu"); - protected static final AminoAcid ASPARTIC_ACID = new AminoAcid("D" , "Aspartic acid", "Asp"); - protected static final AminoAcid LYSINE = new AminoAcid("K" , "Lysine", "Lys"); - protected static final AminoAcid ARGININE = new AminoAcid("R" , "Arginine", "Arg"); - - protected static HashMap aminoAcidTable = new HashMap(); - protected static HashMap mitochondrialAminoAcidTable = new HashMap(); - - static { - //populate the tables - aminoAcidTable.put("ATT", ISOLEUCINE); - aminoAcidTable.put("ATC", ISOLEUCINE); - aminoAcidTable.put("ATA", ISOLEUCINE); - - - aminoAcidTable.put("CTT", LEUCINE); - aminoAcidTable.put("CTC", LEUCINE); - aminoAcidTable.put("CTA", LEUCINE); - aminoAcidTable.put("CTG", LEUCINE); - aminoAcidTable.put("TTA", LEUCINE); - aminoAcidTable.put("TTG", LEUCINE); - - - aminoAcidTable.put("GTT", VALINE); - aminoAcidTable.put("GTC", VALINE); - aminoAcidTable.put("GTA", VALINE); - aminoAcidTable.put("GTG", VALINE); - - - aminoAcidTable.put("TTT", PHENYLALANINE); - aminoAcidTable.put("TTC", PHENYLALANINE); - - - aminoAcidTable.put("ATG", METHIONINE); - - aminoAcidTable.put("TGT", CYSTEINE); - aminoAcidTable.put("TGC", CYSTEINE); - - aminoAcidTable.put("GCT", ALANINE); - aminoAcidTable.put("GCC", ALANINE); - aminoAcidTable.put("GCA", ALANINE); - aminoAcidTable.put("GCG", ALANINE); - - - aminoAcidTable.put("GGT", GLYCINE); - aminoAcidTable.put("GGC", GLYCINE); - aminoAcidTable.put("GGA", GLYCINE); - aminoAcidTable.put("GGG", GLYCINE); - - - aminoAcidTable.put("CCT", PROLINE); - aminoAcidTable.put("CCC", PROLINE); - aminoAcidTable.put("CCA", PROLINE); - aminoAcidTable.put("CCG", PROLINE); - - - - - aminoAcidTable.put("ACT", THEONINE); - aminoAcidTable.put("ACC", THEONINE); - aminoAcidTable.put("ACA", THEONINE); - aminoAcidTable.put("ACG", THEONINE); - - - - 
aminoAcidTable.put("TCT", SERINE); - aminoAcidTable.put("TCC", SERINE); - aminoAcidTable.put("TCA", SERINE); - aminoAcidTable.put("TCG", SERINE); - aminoAcidTable.put("AGT", SERINE); - aminoAcidTable.put("AGC", SERINE); - - aminoAcidTable.put("TAT", TYROSINE); - aminoAcidTable.put("TAC", TYROSINE); - - - - aminoAcidTable.put("TGG", TRYPTOPHAN); - - - aminoAcidTable.put("CAA", GLUTAMINE); - aminoAcidTable.put("CAG", GLUTAMINE); - - - aminoAcidTable.put("AAT", ASPARAGINE); - aminoAcidTable.put("AAC", ASPARAGINE); - - - aminoAcidTable.put("CAT", HISTIDINE); - aminoAcidTable.put("CAC", HISTIDINE); - - - aminoAcidTable.put("GAA", GLUTAMIC_ACID); - aminoAcidTable.put("GAG", GLUTAMIC_ACID); - - - - aminoAcidTable.put("GAT", ASPARTIC_ACID); - aminoAcidTable.put("GAC", ASPARTIC_ACID); - - - aminoAcidTable.put("AAA", LYSINE); - aminoAcidTable.put("AAG", LYSINE); - - - aminoAcidTable.put("CGT", ARGININE); - aminoAcidTable.put("CGC", ARGININE); - aminoAcidTable.put("CGA", ARGININE); - aminoAcidTable.put("CGG", ARGININE); - aminoAcidTable.put("AGA", ARGININE); - aminoAcidTable.put("AGG", ARGININE); - - - aminoAcidTable.put("TAA", STOP_CODON ); - aminoAcidTable.put("TAG", STOP_CODON); - aminoAcidTable.put("TGA", STOP_CODON); - - - //populate the mitochondrial AA table - mitochondrialAminoAcidTable.putAll(aminoAcidTable); - mitochondrialAminoAcidTable.put("AGA", STOP_CODON); - mitochondrialAminoAcidTable.put("AGG", STOP_CODON); - mitochondrialAminoAcidTable.put("ATA", METHIONINE); - mitochondrialAminoAcidTable.put("TGA", TRYPTOPHAN); - } - - /** - * Returns the amino acid encoded by the given codon in a eukaryotic genome. - * - * @param codon The 3-letter mRNA nucleotide codon 5' to 3'. Expects T's instead of U's. Not case sensitive. 
- * - * @return The amino acid matching the given codon, or the UNKNOWN amino acid if the codon string doesn't match anything - */ - public static AminoAcid getEukaryoticAA(String codon) { - codon = codon.toUpperCase(); - final AminoAcid aa = aminoAcidTable.get(codon); - return aa == null ? UNKNOWN : aa; - } - - - /** - * Returns the amino acid encoded by the given codon in a mitochondrial genome. - * - * @param codon The 3-letter mRNA nucleotide codon 5' to 3'. Expects T's instead of U's. Not case sensitive. - * @param isFirstCodon If this is the 1st codon in the gene, then "ATT" encodes Methyonine - * - * @return The amino acid matching the given codon in mitochondrial genes, or the UNKNOWN amino acid if the codon string doesn't match anything - */ - public static AminoAcid getMitochondrialAA(String codon, boolean isFirstCodon) { - codon = codon.toUpperCase(); - final AminoAcid aa = mitochondrialAminoAcidTable.get(codon); - if(aa == null) { - return UNKNOWN; - } else if(isFirstCodon && codon.equals("ATT")) { - return METHIONINE; //special case - 'ATT' in the first codon of a mitochondrial gene codes for methionine instead of isoleucine - } else { - return aa; - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java b/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java index 4455666e8..fdcb8ba03 100644 --- a/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java +++ b/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java @@ -1,32 +1,104 @@ package org.broadinstitute.sting.utils; +import java.util.concurrent.TimeUnit; + /** - * Simple utility class that makes it convenient to print unit adjusted times + * Conveniently print a time with an automatically determined time unit + * + * For example, if the amount of time is 10^6 seconds, instead of printing + * out 10^6 seconds, prints out 11.57 days instead. 
+ * + * Dynamically uses time units: + * + * - seconds: s + * - minutes: m + * - hours : h + * - days : d + * - weeks : w + * + * @author depristo + * @since 2009 */ public class AutoFormattingTime { + private static final double NANOSECONDS_PER_SECOND = 1e9; + + /** + * Width a la format's %WIDTH.PERCISIONf + */ private final int width; // for format + + /** + * Precision a la format's %WIDTH.PERCISIONf + */ private final int precision; // for format - double timeInSeconds; // in Seconds - private final String formatString; + /** + * The elapsed time in nanoseconds + */ + private final long nanoTime; + + /** + * Create a new autoformatting time with elapsed time nanoTime in nanoseconds + * @param nanoTime the elapsed time in nanoseconds + * @param width the width >= 0 (a la format's %WIDTH.PERCISIONf) to use to display the format, or -1 if none is required + * @param precision the precision to display the time at. Must be >= 0; + */ + public AutoFormattingTime(final long nanoTime, final int width, int precision) { + if ( width < -1 ) throw new IllegalArgumentException("Width " + width + " must be >= -1"); + if ( precision < 0 ) throw new IllegalArgumentException("Precision " + precision + " must be >= 0"); - public AutoFormattingTime(double timeInSeconds, final int width, int precision) { this.width = width; - this.timeInSeconds = timeInSeconds; + this.nanoTime = nanoTime; this.precision = precision; - this.formatString = "%" + width + "." 
+ precision + "f %s"; } - public AutoFormattingTime(double timeInSeconds, int precision) { - this(timeInSeconds, 6, precision); + /** + * @see #AutoFormattingTime(long, int, int) but with default width and precision + * @param nanoTime + */ + public AutoFormattingTime(final long nanoTime) { + this(nanoTime, 6, 1); } + /** + * @see #AutoFormattingTime(long, int, int) but with time specificied as a double in seconds + */ + public AutoFormattingTime(final double timeInSeconds, final int width, final int precision) { + this(secondsToNano(timeInSeconds), width, precision); + } + + /** + * @see #AutoFormattingTime(long) but with time specificied as a double in seconds + */ public AutoFormattingTime(double timeInSeconds) { - this(timeInSeconds, 1); + this(timeInSeconds, 6, 1); } + /** + * Precomputed format string suitable for string.format with the required width and precision + */ + private String getFormatString() { + final StringBuilder b = new StringBuilder("%"); + if ( width != -1 ) + b.append(width); + b.append(".").append(precision).append("f %s"); + return b.toString(); + } + + /** + * Get the time associated with this object in nanoseconds + * @return the time in nanoseconds + */ + public long getTimeInNanoSeconds() { + return nanoTime; + } + + /** + * Get the time associated with this object in seconds, as a double + * @return time in seconds as a double + */ public double getTimeInSeconds() { - return timeInSeconds; + return TimeUnit.NANOSECONDS.toSeconds(getTimeInNanoSeconds()); } /** @@ -44,15 +116,16 @@ public class AutoFormattingTime { } /** - * Instead of 10000 s, returns 2.8 hours - * @return + * Get a string representation of this time, automatically converting the time + * to a human readable unit with width and precision provided during construction + * @return a non-null string */ public String toString() { - double unitTime = timeInSeconds; + double unitTime = getTimeInSeconds(); String unit = "s"; - if ( timeInSeconds > 120 ) { - unitTime = 
timeInSeconds / 60; // minutes + if ( unitTime > 120 ) { + unitTime /= 60; // minutes unit = "m"; if ( unitTime > 120 ) { @@ -64,13 +137,24 @@ public class AutoFormattingTime { unit = "d"; if ( unitTime > 20 ) { - unitTime /= 7; // days + unitTime /= 7; // weeks unit = "w"; } } } } - return String.format(formatString, unitTime, unit); + return String.format(getFormatString(), unitTime, unit); } + + + /** + * Convert a time in seconds as a double into nanoseconds as a long + * @param timeInSeconds an elapsed time in seconds, as a double + * @return an equivalent value in nanoseconds as a long + */ + private static long secondsToNano(final double timeInSeconds) { + return (long)(NANOSECONDS_PER_SECOND * timeInSeconds); + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index dbffacfbc..c18eef23f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -287,6 +287,11 @@ public final class GenomeLocParser { return new GenomeLoc(contig, index, start, stop); } + public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) { + GenomeLoc contigLoc = createOverEntireContig(contig); + return new GenomeLoc(contig, getContigIndex(contig), start, stop).intersect(contigLoc); + } + /** * validate a position or interval on the genome as valid * diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 861f172d9..844fa6b90 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -14,7 +14,8 @@ public class QualityUtils { public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); public final static double 
MIN_REASONABLE_ERROR = 0.0001; - public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious + public final static byte MAX_REASONABLE_Q_SCORE = 60; // bams containing quals above this value are extremely suspicious and we should warn the user + public final static byte MAX_GATK_USABLE_Q_SCORE = 40; // quals above this value should be capped down to this value (because they are too high) public final static byte MIN_USABLE_Q_SCORE = 6; public final static int MAPPING_QUALITY_UNAVAILABLE = 255; @@ -73,7 +74,7 @@ public class QualityUtils { } public static double qualToErrorProb(final double qual) { - return Math.pow(10.0, ((double) qual)/-10.0); + return Math.pow(10.0, qual/-10.0); } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 0d12d53cc..c12dfcee9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -31,7 +31,7 @@ public class ActiveRegion implements HasGenomeLocation { this.isActive = isActive; this.genomeLocParser = genomeLocParser; this.extension = extension; - extendedLoc = genomeLocParser.createGenomeLoc(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); fullExtentReferenceLoc = extendedLoc; } diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java deleted file mode 100644 index 5a669e65c..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright 
(c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.collections; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - - -/** This class, closely resembling a deque (except that it is not dynamically grown), - * provides an object with array-like interface and efficient - * implementation of shift operation. Use this class when some kind of sliding window is required: - * e.g. an array (window) is populated from some stream of data, and then the window is shifted. - * If most of the data in the window remains the same so that only a few old elements sjould be popped from - * and a few new elements pushed onto the array, both re-populating the whole array from the data and - * shifting a regular array would be grossly inefficient. 
Instead, shiftData(int N) method of circular array - * efficiently pops out N first elements and makes last N elements available. - * - * Consider an example of reading a character stream A,B,C,D,....,Z into an array with requirement of keeping - * last 5 letters. First, we would read first 5 letters same way as we would with a regular array:

- * - * - * CircularArray a(5);
- * for ( int i = 0; i < 5; i++ ) a.set(i, readChar());
- *
- *
- * and then on the arrival of each next character we shift the array:

- * - * - * a.shiftData(1); a.set(4, readChar() );
- *
- *
- * After the lines from the above example are executed, the array will logically look as:
- * - * B,C,D,E,F,

- * - * e.g. as if we had a regular array, shifted it one element down and added new element on the top. - * - * - * @author asivache - * - */ -public class CircularArray { - - - private Object[] data ; - private int offset; - - /** Creates an array of fixed length */ - public CircularArray(int length) { - if ( length <= 0 ) throw new ReviewedStingException("CircularArray length must be positive. Passed: "+length); - data = new Object[length]; - offset = 0; - } - - /** Returns length of the array */ - public int length() { - return data.length; - } - - /** Gets i-th element of the array - * - * @throws IndexOutOfBoundsException if value of i is illegal - */ - @SuppressWarnings("unchecked") - public T get(int i) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; element requested: "+i); - return (T)(data [ ( offset + i ) % data.length ]); - } - - /** Sets i-th element of the array to the specified value. - * - * @throws IndexOutOfBoundsException if value of i is illegal - */ - public void set(int i, T value) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; set element request at: "+i); - data [ ( offset + i ) % data.length ] = value; - } - - /** Set all elements to null. - * - */ - public void clear() { - for ( int i = 0 ; i < data.length ; i++ ) data[i] = null; - offset = 0; - } - - /** Efficient shift-down of the array data. After this operation, array.get(0), array.get(1), etc will - * be returning what array.get(shift), array.get(shift+1),... were returning before the shift was performed, - * and last shift elements of the array will be reset to 0. - * @param shift - */ - public void shiftData(int shift) { - if ( shift >= data.length ) { - // if we shift by more than the length of stored data, we lose - // all that data completely, so we just re-initialize the array. 
- // This is not the operating mode CircularArray is intended for - // but we can handle it, just in case. - for ( int i = 0 ; i < data.length ; i++ ) data[i] = null; - offset = 0; - return; - } - - // shift < data.length, so at least some data should be preserved - - final int newOffset = ( offset+shift ) % data.length; - if ( newOffset < offset ) { - // wrap-around! - for ( int i = offset ; i < data.length ; i++ ) data[i] = null; - for ( int i = 0; i < newOffset ; i++ ) data[i] = null; - } else { - for ( int i = offset ; i < newOffset ; i++ ) data[i] = null; - } - offset = newOffset; - } - - - - /** Implements primitive int type-based circular array. See CircularArray for details. - * - * @author asivache - * - */ - public static class Int { - private int [] data ; - private int offset; - - /** Creates an array of fixed length */ - public Int(int length) { - if ( length <= 0 ) throw new ReviewedStingException("CircularArray length must be positive. Passed: "+length); - data = new int[length]; // automaticaly initialized to zeros - offset = 0; - } - - /** Returns length of the array */ - public int length() { - return data.length; - } - - /** Gets i-th element of the array - * - * @throws IndexOutOfBoundsException if value of i is illegal - */ - public int get(int i) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; element requested: "+i); - return data [ ( offset + i ) % data.length ]; - } - - /** Sets i-th element of the array to the specified value. - * - * @throws IndexOutOfBoundsException if value of i is illegal - */ - public void set(int i, int value) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; set element request at: "+i); - data [ ( offset + i ) % data.length ] = value; - } - - /** Increments i-th element of the array by the specified value (value can be negative). 
- * - * @throws IndexOutOfBoundsException if i is illegal - */ - public void increment(int i, int value) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; increment element request at: "+i); - data [ ( offset + i ) % data.length ] += value; - } - - /** Set all elements to 0. - * - */ - public void clear() { - for ( int i = 0 ; i < data.length ; i++ ) data[i] = 0; - offset = 0; - } - - /** Efficient shift-down of the array data. After this operation, array.get(0), array.get(1), etc will - * be returning what array.get(shift), array.get(shift+1),... were returning before the shift was performed, - * and last shift elements of the array will be reset to 0. - * @param shift - */ - public void shiftData(int shift) { - if ( shift >= data.length ) { - // if we shift by more than the length of stored data, we lose - // all that data completely, so we just re-initialize the array. - // This is not the operating mode CircularArray is intended for - // but we can handle it, just in case. - for ( int i = 0 ; i < data.length ; i++ ) data[i] = 0; - offset = 0; - return; - } - - // shift < data.length, so at least some data should be preserved - - final int newOffset = ( offset+shift ) % data.length; - if ( newOffset < offset ) { - // wrap-around! 
- for ( int i = offset ; i < data.length ; i++ ) data[i] = 0; - for ( int i = 0; i < newOffset ; i++ ) data[i] = 0; - } else { - for ( int i = offset ; i < newOffset ; i++ ) data[i] = 0; - } - offset = newOffset; - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java b/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java deleted file mode 100755 index 29ffb13e4..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.interval; - -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. 
- * User: asivache - * Date: Oct 7, 2010 - * Time: 2:40:02 PM - * To change this template use File | Settings | File Templates. - */ - -/** This class provides an adapter to Iterator that returns only (parts of) underlying iterator's - * intervals overlapping with specified "master set" of bounding intervals. The underlying iterator must return - * NON-overlapping intervals in coordinate-sorted order, otherwise the behavior is unspecified. If the master set is represented by - * another interval iterator, it should return sorted and NON-overlapping intervals. - * - */ -public class OverlappingIntervalIterator implements Iterator { - PushbackIterator iter = null; - PushbackIterator boundBy = null; - - GenomeLoc prefetchedOverlap = null; - GenomeLoc currentBound = null; - GenomeLoc currentInterval = null; - - - /** Creates new overlapping iterator that will internally traverse intervals and return only - * overlaps of those with set of intervals returned by boundBy. - * @param intervals - * @param boundBy - */ - public OverlappingIntervalIterator(Iterator intervals, Iterator boundBy) { - this.iter = new PushbackIterator(intervals); - this.boundBy = new PushbackIterator(boundBy); - - if ( iter.hasNext() && boundBy.hasNext() ) { - currentInterval = iter.next(); // load first interval - currentBound = boundBy.next(); // load first bounding interval - fetchNextOverlap(); - } - } - - /** Traverses both iterators in sync, until the first overlap between the two is reached. If no overlap is found - * until the end of the either of the two streams, leaves prefetchedOverlap set to null - */ - private void fetchNextOverlap() { - - prefetchedOverlap = null; - // System.out.println("Fetching... (interval="+currentInterval+"; bound="+currentBound+")"); - while ( currentInterval != null && currentBound != null ) { - - if ( currentInterval.isBefore(currentBound) ) { -// System.out.println(currentInterval +" is before "+currentBound ); - if ( ! 
iter.hasNext() ) currentInterval = null; - else currentInterval = iter.next(); - continue; - } - - if ( currentInterval.isPast(currentBound) ) { -// System.out.println(currentInterval +" is past "+currentBound ); - if ( ! boundBy.hasNext() ) currentBound = null; - else currentBound = boundBy.next(); - continue; - } - - // we are at this point only if currentInterval overlaps with currentBound - - prefetchedOverlap = currentInterval.intersect(currentBound); -// System.out.println("Fetched next overlap: "+prefetchedOverlap); - // now we need to advance at least one of the iterators, so that we would not - // call the same overlap again - - // however we still do not know if we are done with either current interval or current bound, because - // two special situations are possible: - // - // 1) next interval overlaps with 2) current interval also overlaps with - // the same bounding interval; next bounding interval; note that - // note that in this case next in this case next bound necessarily - // interval necessarily starts before starts before the next interval - // the next bound - // - // curr. int next int. curr. int - // ----- ------ -------------------------- - // ------------------- --------- ------------- - // curr. bound curr. bound next bound - - // To solve this issue we update either only currentInterval or only currentBound to their next value, - // whichever of those next values (intervals) comes first on the reference genome; - // the rest of the traversal to the next overlap will be performed on the next invocation of - // fetchNextOverlap(). - - advanceToNearest(); - - break; // now that we computed the overlap and advanced (at least one of) the intervals/bounds to - // the next location, we are done - bail out from the loop. - } - - } - - private void advanceToNearest() { - if ( ! iter.hasNext() ) { - currentBound = boundBy.hasNext() ? boundBy.next() : null; - } else { - if ( ! boundBy.hasNext() ) currentInterval = iter.hasNext() ? 
iter.next() : null; - else { - // both intervals and bounds have next value available; let's check which comes first: - GenomeLoc nextInterval = iter.next(); - GenomeLoc nextBound = boundBy.next(); - - if ( nextInterval.compareTo(nextBound) < 0 ) { - currentInterval = nextInterval; - boundBy.pushback(nextBound); - } else { - currentBound = nextBound; - iter.pushback(nextInterval); - } - - } - } - } - - /** - * Returns true if the iteration has more elements. (In other - * words, returns true if next would return an element - * rather than throwing an exception.) - * - * @return true if the iterator has more elements. - */ - public boolean hasNext() { - return prefetchedOverlap != null; - } - - /** - * Returns the next element in the iteration. - * - * @return the next element in the iteration. - * @throws java.util.NoSuchElementException - * iteration has no more elements. - */ - public GenomeLoc next() { - if ( prefetchedOverlap == null ) - throw new java.util.NoSuchElementException("Illegal call to next(): Overlapping iterator has no more overlaps"); - GenomeLoc ret = prefetchedOverlap; // cache current prefetched overlap - fetchNextOverlap(); // prefetch next overlap - return ret ; - } - - /** - * Removes from the underlying collection the last element returned by the - * iterator (optional operation). This method can be called only once per - * call to next. The behavior of an iterator is unspecified if - * the underlying collection is modified while the iteration is in - * progress in any way other than by calling this method. - * - * @throws UnsupportedOperationException if the remove - * operation is not supported by this Iterator. - * @throws IllegalStateException if the next method has not - * yet been called, or the remove method has already - * been called after the last call to the next - * method. 
- */ - public void remove() { - throw new UnsupportedOperationException("remove() method is not supported by OverlappingIntervalIterator"); - //To change body of implemented methods use File | Settings | File Templates. - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 567514f8c..3e0f36799 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -76,7 +76,7 @@ public class BaseRecalibration { quantizationInfo = recalibrationReport.getQuantizationInfo(); if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores quantizationInfo.noQuantization(); - else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report. quantizationInfo.quantizeQualityScores(quantizationLevels); this.disableIndelQuals = disableIndelQuals; @@ -233,9 +233,9 @@ public class BaseRecalibration { * Note that this calculation is a constant for each rgKey and errorModel. We need only * compute this value once for all data. 
* - * @param rgKey - * @param errorModel - * @return + * @param rgKey read group key + * @param errorModel the event type + * @return global delta Q */ private double calculateGlobalDeltaQ(final int rgKey, final EventType errorModel) { double result = 0.0; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java index a5a3104a0..b5370e268 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java @@ -175,8 +175,7 @@ public class QualQuantizer { } /** - * Human readable name of this interval: e.g., 10-12 - * @return + * @return Human readable name of this interval: e.g., 10-12 */ public String getName() { return qStart + "-" + qEnd; @@ -188,8 +187,7 @@ public class QualQuantizer { } /** - * Returns the error rate (in real space) of this interval, or 0 if there are no obserations - * @return + * @return the error rate (in real space) of this interval, or 0 if there are no observations */ @Ensures("result >= 0.0") public double getErrorRate() { @@ -202,9 +200,7 @@ public class QualQuantizer { } /** - * Returns the QUAL of the error rate of this interval, or the fixed - * qual if this interval was created with a fixed qual. - * @return + * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. */ @Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE") public byte getQual() { @@ -254,7 +250,7 @@ public class QualQuantizer { final QualInterval right = this.compareTo(toMerge) < 0 ? 
toMerge : this; if ( left.qEnd + 1 != right.qStart ) - throw new ReviewedStingException("Attempting to merge non-continguous intervals: left = " + left + " right = " + right); + throw new ReviewedStingException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); final long nCombinedObs = left.nObservations + right.nObservations; final long nCombinedErr = left.nErrors + right.nErrors; @@ -343,8 +339,7 @@ public class QualQuantizer { } /** - * Helper function that finds and mergest together the lowest penalty pair - * of intervals + * Helper function that finds and merges together the lowest penalty pair of intervals * @param intervals */ @Requires("! intervals.isEmpty()") diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 1ab3b10c4..9b58a5900 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -28,9 +28,10 @@ package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; import com.google.java.contract.Requires; +import org.apache.commons.math.optimization.fitting.GaussianFunction; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import java.util.Random; /** * An individual piece of recalibration data. 
Each bin counts up the number of observations and the number @@ -149,7 +150,7 @@ public class RecalDatum { //--------------------------------------------------------------------------------------------------------------- /** - * Returns the error rate (in real space) of this interval, or 0 if there are no obserations + * Returns the error rate (in real space) of this interval, or 0 if there are no observations * @return the empirical error rate ~= N errors / N obs */ @Ensures("result >= 0.0") @@ -262,6 +263,9 @@ public class RecalDatum { @Requires("empiricalQuality == UNINITIALIZED") @Ensures("empiricalQuality != UNINITIALIZED") private synchronized void calcEmpiricalQuality() { + + // TODO -- add code for Bayesian estimate of Qemp here + final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } @@ -276,4 +280,65 @@ public class RecalDatum { private double calcExpectedErrors() { return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); } -} + + static final boolean DEBUG = false; + static final double RESOLUTION_BINS_PER_QUAL = 1.0; + + static public double bayesianEstimateOfEmpiricalQuality(final double nObservations, final double nErrors, final double QReported) { + + final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; + + final double[] log10Posteriors = new double[numBins]; + + for ( int bin = 0; bin < numBins; bin++ ) { + + final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; + + log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10Likelihood(QEmpOfBin, nObservations, nErrors); + + if ( DEBUG ) + System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); + } + + if ( DEBUG ) + System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = 
%f", QReported, nObservations, nErrors)); + + final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); + final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); + + final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL; + return Qemp; + } + + static final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1]; + static { + // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) + // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". + final double GF_a = 0.0; + final double GF_b = 0.9; + final double GF_c = 0.0; + final double GF_d = 0.5; + + final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); + for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) + log10QempPriorCache[i] = Math.log10(gaussian.value((double) i)); + } + + static public double log10QempPrior(final double Qempirical, final double Qreported) { + final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), QualityUtils.MAX_GATK_USABLE_Q_SCORE); + if ( DEBUG ) + System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); + return log10QempPriorCache[difference]; + } + + static public double log10Likelihood(final double Qempirical, final double nObservations, final double nErrors) { + // this is just a straight binomial PDF + double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); + if ( log10Prob == Double.NEGATIVE_INFINITY ) + log10Prob = -Double.MAX_VALUE; + + if ( DEBUG ) + System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + return log10Prob; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java 
b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 081221da4..6ecac1394 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -9,8 +9,7 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import java.io.File; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** @@ -199,7 +198,7 @@ public class RecalibrationReport { private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); - final double empiricalQuality = (Double) reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME); + final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); // the estimatedQreported column only exists in the ReadGroup table final double estimatedQReported = hasEstimatedQReportedColumn ? 
diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 7ef05edd8..5ad4f5c8b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -236,18 +236,6 @@ public class AlignmentUtils { return n; } - public static int getNumAlignedBases(final SAMRecord r) { - int n = 0; - final Cigar cigar = r.getCigar(); - if (cigar == null) return 0; - - for (final CigarElement e : cigar.getCigarElements()) - if (e.getOperator() == CigarOperator.M) - n += e.getLength(); - - return n; - } - public static int getNumAlignedBasesCountingSoftClips(final SAMRecord r) { int n = 0; final Cigar cigar = r.getCigar(); @@ -512,47 +500,6 @@ public class AlignmentUtils { return true; } - /** - * Returns true is read is mapped and mapped uniquely (Q>0). - * - * @param read - * @return - */ - public static boolean isReadUniquelyMapped(SAMRecord read) { - return (!AlignmentUtils.isReadUnmapped(read)) && read.getMappingQuality() > 0; - } - - /** - * Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from - * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). - * - * @param read - * @return - */ - public static byte[] getQualsInCycleOrder(SAMRecord read) { - if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getBaseQualities(); - - return Utils.reverse(read.getBaseQualities()); - } - - /** - * Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from - * cycle 1). 
In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities - * are available this method will throw a runtime exception. - * - * @param read - * @return - */ - public static byte[] getOriginalQualsInCycleOrder(SAMRecord read) { - if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getOriginalBaseQualities(); - - return Utils.reverse(read.getOriginalBaseQualities()); - } - /** * Takes the alignment of the read sequence readSeq to the reference sequence refSeq * starting at 0-based position refIndex on the refSeq and specified by its cigar. diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java deleted file mode 100755 index 31deb7535..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.SAMRecord; - -public class ComparableSAMRecord implements Comparable { - - private SAMRecord record; - - public ComparableSAMRecord(SAMRecord record) { - this.record = record; - } - - public SAMRecord getRecord() { - return record; - } - - public int compareTo(ComparableSAMRecord o) { - // first sort by start position -- with not coverflow because both are guaranteed to be positive. - int comparison = record.getAlignmentStart() - o.record.getAlignmentStart(); - // if the reads have the same start position, we must give a non-zero comparison - // (because java Sets often require "consistency with equals") - if ( comparison == 0 ) - comparison = record.getReadName().compareTo(o.getRecord().getReadName()); - // if the read names are the same, use the first of the pair if appropriate - if ( comparison == 0 && record.getReadPairedFlag() ) - comparison = ( record.getFirstOfPairFlag() ? 
-1 : 1); - return comparison; - } - - public boolean equals(Object obj) { - if ( !(obj instanceof ComparableSAMRecord) ) - return false; - if ( this == obj ) - return true; - - ComparableSAMRecord csr = (ComparableSAMRecord)obj; - if(record.getAlignmentStart() != csr.record.getAlignmentStart()) - return false; - if ( !record.getReadName().equals(csr.getRecord().getReadName()) ) - return false; - return ( record.getFirstOfPairFlag() == csr.record.getFirstOfPairFlag() ); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java index 9159f5657..68ccd8e58 100644 --- a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java +++ b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java @@ -133,7 +133,7 @@ public class ThreadEfficiencyMonitor { */ public synchronized void printUsageInformation(final Logger logger, final Priority priority) { logger.debug("Number of threads monitored: " + getnThreadsAnalyzed()); - logger.debug("Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toSeconds(getTotalTime()))); + logger.debug("Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toNanos(getTotalTime()))); for ( final State state : State.values() ) { logger.debug(String.format("\tPercent of time spent %s is %.2f", state.getUserFriendlyName(), getStatePercent(state))); } diff --git a/public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java b/public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java new file mode 100644 index 000000000..4292a5ef4 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software 
and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting; + +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * UnitTests for the AutoFormatting + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class AutoFormattingTimeUnitTest extends BaseTest { + @DataProvider(name = "AutoFormattingTimeUnitSelection") + public Object[][] makeTimeData() { + List tests = new ArrayList(); + tests.add(new Object[]{TimeUnit.SECONDS.toNanos(10), "s"}); + tests.add(new Object[]{TimeUnit.MINUTES.toNanos(10), "m"}); + tests.add(new Object[]{TimeUnit.HOURS.toNanos(10), "h"}); + tests.add(new Object[]{TimeUnit.DAYS.toNanos(10), "d"}); + tests.add(new Object[]{TimeUnit.DAYS.toNanos(1000), "w"}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AutoFormattingTimeUnitSelection") + public void testUnitSelection(final long nano, final String expectedUnit) throws InterruptedException { + final AutoFormattingTime time = new AutoFormattingTime(nano); + testBasic(time, nano, time.getWidth(), time.getPrecision()); + Assert.assertTrue(time.toString().endsWith(expectedUnit), "TimeUnit " + time.toString() + " didn't contain expected time unit " + expectedUnit); + } + + @Test(dataProvider = "AutoFormattingTimeUnitSelection") + public void testSecondsAsDouble(final long nano, final String expectedUnit) throws InterruptedException { + final double inSeconds = nano * 1e-9; + final long nanoFromSeconds = (long)(inSeconds * 1e9); + final AutoFormattingTime time = new AutoFormattingTime(inSeconds); + testBasic(time, nanoFromSeconds, time.getWidth(), time.getPrecision()); + } + + @DataProvider(name = "AutoFormattingTimeWidthAndPrecision") + public Object[][] makeTimeWidthAndPrecision() { + List tests = new ArrayList(); + for ( final int width : Arrays.asList(-1, 1, 2, 6, 20) ) { + for ( final int precision : Arrays.asList(1, 2) ) { + tests.add(new Object[]{100.123456 * 1e9, width, precision}); + tests.add(new Object[]{0.123456 * 1e9, width, precision}); + } + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AutoFormattingTimeWidthAndPrecision") + public void testWidthAndPrecision(final double inSeconds, final int width, final int precision) 
throws InterruptedException { + final AutoFormattingTime time = new AutoFormattingTime(inSeconds, width, precision); + final long nanoFromSeconds = (long)(inSeconds * 1e9); + testBasic(time, nanoFromSeconds, width, precision); + final Matcher match = matchToString(time); + match.matches(); + final String widthString = match.group(1); + final String precisionString = match.group(2); + if ( width != -1 ) { + final int actualWidth = widthString.length() + 1 + precisionString.length(); + Assert.assertTrue(actualWidth >= width, "width string '" + widthString + "' not >= the expected width " + width); + } + Assert.assertEquals(precisionString.length(), precision, "precision string '" + precisionString + "' not the expected precision " + precision); + } + + private static Matcher matchToString(final AutoFormattingTime time) { + Pattern pattern = Pattern.compile("(\\s*\\d*)\\.(\\d*) \\w"); + return pattern.matcher(time.toString()); + } + + private static void testBasic(final AutoFormattingTime aft, final long nano, final int expectedWidth, final int expectedPrecision) { + Assert.assertEquals(aft.getTimeInNanoSeconds(), nano); + assertEqualsDoubleSmart(aft.getTimeInSeconds(), nano * 1e-9, 1e-3, "Time in seconds not within tolerance of nanoSeconds"); + Assert.assertEquals(aft.getWidth(), expectedWidth); + Assert.assertEquals(aft.getPrecision(), expectedPrecision); + Assert.assertNotNull(aft.toString(), "TimeUnit toString returned null"); + final Matcher match = matchToString(aft); + Assert.assertTrue(match.matches(), "toString " + aft.toString() + " doesn't match our expected format"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 76e25a3c0..a1f84b8c2 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -301,7 +301,11 @@ public abstract class BaseTest { Assert.assertTrue(actualSet.equals(expectedSet), 
info); // note this is necessary due to testng bug for set comps } - public static final void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { + assertEqualsDoubleSmart(actual, expected, tolerance, null); + } + + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) { if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately Assert.assertTrue(Double.isNaN(actual), "expected is nan, actual is not"); else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately @@ -309,7 +313,9 @@ public abstract class BaseTest { else { final double delta = Math.abs(actual - expected); final double ratio = Math.abs(actual / expected - 1.0); - Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual + " not within tolerance " + tolerance); + Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual + + " not within tolerance " + tolerance + + (message == null ? 
"" : "message: " + message)); } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java similarity index 94% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java rename to public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 185eca85c..4cda1455e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -45,7 +45,7 @@ import java.util.*; * Test the Active Region Traversal Contract * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract */ -public class TraverseActiveRegionsTest extends BaseTest { +public class TraverseActiveRegionsUnitTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; @@ -103,7 +103,8 @@ public class TraverseActiveRegionsTest extends BaseTest { private List intervals; - private static final String testBAM = "TraverseActiveRegionsTest.bam"; + private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; + private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; @BeforeClass private void init() throws FileNotFoundException { @@ -117,7 +118,8 @@ public class TraverseActiveRegionsTest extends BaseTest { // TODO: reads which are partially between intervals (in/outside extension) // TODO: duplicate reads - // TODO: should we assign reads which are completely outside intervals but within extension? 
+ // TODO: reads which are completely outside intervals but within extension + // TODO: test the extension itself intervals = new ArrayList(); @@ -148,6 +150,8 @@ public class TraverseActiveRegionsTest extends BaseTest { private void createBAM(List reads) { File outFile = new File(testBAM); outFile.deleteOnExit(); + File indexFile = new File(testBAI); + indexFile.deleteOnExit(); SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(reads.get(0).getHeader(), true, outFile); for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { @@ -237,6 +241,21 @@ public class TraverseActiveRegionsTest extends BaseTest { Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); } + @Test + public void testActiveRegionExtensionOnContig() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + Collection activeRegions = getActiveRegions(walker, intervals).values(); + for (ActiveRegion activeRegion : activeRegions) { + GenomeLoc loc = activeRegion.getExtendedLoc(); + + // Contract: active region extensions must stay on the contig + Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); + int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); + Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); + } + } + @Test public void testPrimaryReadMapping() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index b097e3d34..90e1d5c34 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fbfbd4d13b7ba3d76e8e186902e81378")); + Arrays.asList("a127623a26bac4c17c9df491e170ed88")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("19aef8914efc497192f89a9038310ca5")); + Arrays.asList("13e24e6b9dfa241df5baa2c3f53415b9")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("4f0b8033da18e6cf6e9b8d5d36c21ba2")); + Arrays.asList("07cb4d427235878aeec0066d7d298e54")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + 
"NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("64ca176d587dfa2b3b9dec9f7999305c")); + Arrays.asList("e579097677d5e56a5776151251947961")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("f33f417fad98c05d9cd08ffa22943b0f")); + Arrays.asList("348314945436ace71ce6b1a52559d9ee")); executeTest("test exclude annotations", spec); } @@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("0c810f6c4abef9d9dc5513ca872d3d22")); + Arrays.asList("ae7930e37a66c0aa4cfe0232736864fe")); executeTest("test overwriting header", spec); } @@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("1c423b7730b9805e7b885ece924286e0")); + Arrays.asList("a0ba056c2625033e5e859fd6bcec1256")); executeTest("not passing it any reads", spec); } @@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + 
privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("54d7d5bb9404652857adf5e50d995f30")); + Arrays.asList("0be7da17340111a94e8581ee3808c88a")); executeTest("getting DB tag with dbSNP", spec); } @@ -114,7 +114,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testMultipleIdsWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, - Arrays.asList("5fe63e511061ed4f91d938e72e7e3c39")); + Arrays.asList("e40e625302a496ede42eed61c2ce524b")); executeTest("adding multiple IDs with dbSNP", spec); } @@ -122,7 +122,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("cc7184263975595a6e2473d153227146")); + Arrays.asList("cb50876477d3e035b6eda5d720d7ba8d")); executeTest("getting DB tag with HM3", spec); } @@ -130,7 +130,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoQuals() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + privateTestDir + "noQual.vcf -A QualByDepth", 1, - Arrays.asList("aea983adc01cd059193538cc30adc17d")); + Arrays.asList("458412261d61797d39f802c1e03d63f6")); executeTest("test file doesn't have QUALs", spec); } @@ -138,7 +138,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir 
+ "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("2b0e8cdfd691779befc5ac123d1a1887")); + Arrays.asList("39defa8108dca9fa3e54b22a7da43f77")); executeTest("using expression", spec); } @@ -146,7 +146,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("3de1d1998203518098ffae233f3e2352")); + Arrays.asList("a917edd58a0c235e9395bfc2d2020a8c")); executeTest("using expression with ID", spec); }