From f313e14e4ef2c3f933505bb16527313ce09e618c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 12 Jul 2011 08:50:58 -0400 Subject: [PATCH 01/35] Now deletes the dump directory on ant clean Moving diffengine tests from private to public --- build.xml | 1 + .../diffengine/DiffEngineUnitTest.java | 229 ++++++++++++++++ .../walkers/diffengine/DiffNodeUnitTest.java | 249 ++++++++++++++++++ .../diffengine/DiffableReaderUnitTest.java | 143 ++++++++++ .../diffengine/DifferenceUnitTest.java | 95 +++++++ 5 files changed, 717 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java diff --git a/build.xml b/build.xml index 80627fae0..068c69316 100644 --- a/build.xml +++ b/build.xml @@ -981,6 +981,7 @@ + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java new file mode 100644 index 000000000..cd6c3598a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.gatk.walkers.diffengine; + + +// the imports for unit testing. + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DiffEngineUnitTest extends BaseTest { + DiffEngine engine; + + @BeforeClass(enabled = true) + public void createDiffEngine() { + engine = new DiffEngine(); + } + + // -------------------------------------------------------------------------------- + // + // Difference testing routines + // + // -------------------------------------------------------------------------------- + + private class DifferenceTest extends TestDataProvider { + public DiffElement tree1, tree2; + public List differences; + + private DifferenceTest(String tree1, String tree2) { + this(tree1, tree2, Collections.emptyList()); + } + + private DifferenceTest(String tree1, String tree2, String difference) { + this(tree1, tree2, Arrays.asList(difference)); + } + + private DifferenceTest(String tree1, String tree2, List differences) { + super(DifferenceTest.class); + this.tree1 = DiffNode.fromString(tree1); + this.tree2 = DiffNode.fromString(tree2); + this.differences = differences; + } + + public String toString() { + return String.format("tree1=%s tree2=%s diff=%s", + tree1.toOneLineString(), tree2.toOneLineString(), differences); + } + } + + @DataProvider(name = "trees") + public Object[][] createTrees() { + new DifferenceTest("A=X", "A=X"); + new DifferenceTest("A=X", "A=Y", "A:X!=Y"); + new DifferenceTest("A=X", "B=X", Arrays.asList("A:X!=MISSING", "B:MISSING!=X")); + new DifferenceTest("A=(X=1)", "B=(X=1)", Arrays.asList("A:(X=1)!=MISSING", "B:MISSING!=(X=1)")); + new DifferenceTest("A=(X=1)", "A=(X=1)"); + new DifferenceTest("A=(X=1 Y=2)", "A=(X=1 Y=2)"); + new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=3))"); + new DifferenceTest("A=(X=1)", "A=(X=2)", "A.X:1!=2"); + new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=4))", "A.B.Z:3!=4"); + new DifferenceTest("A=(X=1)", "A=(X=1 Y=2)", "A.Y:MISSING!=2"); + new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2)", "A.B:(Z=3)!=MISSING"); + return DifferenceTest.getTests(DifferenceTest.class); + } + + @Test(enabled = true, dataProvider = "trees") + public void testDiffs(DifferenceTest test) { + logger.warn("Test tree1: " + test.tree1.toOneLineString()); + logger.warn("Test tree2: " + test.tree2.toOneLineString()); + + List diffs = engine.diff(test.tree1, test.tree2); + logger.warn("Test expected diff : " + test.differences); + logger.warn("Observed diffs : " + diffs); + } + + // -------------------------------------------------------------------------------- + // + // Low-level routines for summarizing differences + // + // -------------------------------------------------------------------------------- + + @Test(enabled = true) + public void testLongestCommonPostfix() { + testLongestCommonPostfixHelper("A", "A", 1); + testLongestCommonPostfixHelper("A", "B", 0); + testLongestCommonPostfixHelper("A.B", "A.B", 2); + testLongestCommonPostfixHelper("A.B.C", "A.B.C", 3); + testLongestCommonPostfixHelper("A.B.C", "X.B.C", 2); + testLongestCommonPostfixHelper("A.B.C", "X.Y.C", 1); + testLongestCommonPostfixHelper("A.B.C", "X.Y.Z", 0); + testLongestCommonPostfixHelper("A.B.C", "A.X.C", 1); + testLongestCommonPostfixHelper("A.B.C", "A.X.Z", 0); + testLongestCommonPostfixHelper("A.B.C", "A.B.Z", 0); + } + + public void testLongestCommonPostfixHelper(String p1, String p2, int expected) { + String[] parts1 = p1.split("\\."); + String[] parts2 = p2.split("\\."); + int obs = DiffEngine.longestCommonPostfix(parts1, parts2); + Assert.assertEquals(obs, expected, "p1=" + p1 + " p2=" + p2 + " failed"); + } + + @Test(enabled = true, dependsOnMethods = "testLongestCommonPostfix") + public void testSummarizePath() { + testSummarizePathHelper("A", "A", "A"); + testSummarizePathHelper("A", "B", "*"); + testSummarizePathHelper("A.B", "A.B", "A.B"); + testSummarizePathHelper("A.B", "X.B", "*.B"); + testSummarizePathHelper("A.B", "X.Y", "*.*"); + testSummarizePathHelper("A.B.C", "A.B.C", "A.B.C"); + testSummarizePathHelper("A.B.C", "X.B.C", "*.B.C"); + testSummarizePathHelper("A.B.C", "X.Y.C", "*.*.C"); + testSummarizePathHelper("A.B.C", "X.Y.Z", "*.*.*"); + testSummarizePathHelper("A.B.C", "A.X.C", "*.*.C"); + testSummarizePathHelper("A.B.C", "A.X.Z", "*.*.*"); + testSummarizePathHelper("A.B.C", "A.B.Z", "*.*.*"); + } + + public void testSummarizePathHelper(String p1, String p2, String expected) { + String[] parts1 = DiffEngine.diffNameToPath(p1); + String[] parts2 = DiffEngine.diffNameToPath(p2); + int obs = DiffEngine.longestCommonPostfix(parts1, parts2); + String path = DiffEngine.summarizedPath(parts2, obs); + Assert.assertEquals(path, expected, "p1=" + p1 + " p2=" + p2 + " failed"); + } + + // -------------------------------------------------------------------------------- + // + // High-level difference summary + // + // -------------------------------------------------------------------------------- + + private class SummarizeDifferenceTest extends TestDataProvider { + List diffs = new ArrayList(); + List expecteds = new ArrayList(); + + public SummarizeDifferenceTest() { super(SummarizeDifferenceTest.class); } + + public SummarizeDifferenceTest addDiff(String... diffsToAdd) { + diffs.addAll(Arrays.asList(diffsToAdd)); + return this; + } + + public SummarizeDifferenceTest addSummary(String... expectedSummary) { + expecteds.addAll(Arrays.asList(expectedSummary)); + return this; + } + + public String toString() { + return String.format("diffs=%s => expected=%s", diffs, expecteds); + } + + public void test() { + List diffPaths = new ArrayList(diffs.size()); + for ( String diff : diffs ) { diffPaths.add(DiffEngine.diffNameToPath(diff)); } + + List sumDiffs = engine.summarizedDifferencesOfPaths(diffPaths); + + Assert.assertEquals(sumDiffs.size(), expecteds.size(), "Unexpected number of summarized differences: " + sumDiffs); + + for ( int i = 0; i < sumDiffs.size(); i++ ) { + DiffEngine.SummarizedDifference sumDiff = sumDiffs.get(i); + String expected = expecteds.get(i); + String[] pathCount = expected.split(":"); + String path = pathCount[0]; + int count = Integer.valueOf(pathCount[1]); + Assert.assertEquals(sumDiff.getPath(), path, "Unexpected path at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); + Assert.assertEquals(sumDiff.getCount(), count, "Unexpected counts at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); + } + } + } + + @DataProvider(name = "summaries") + public Object[][] createSummaries() { + new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); + new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); + new SummarizeDifferenceTest().addDiff("A", "A", "A").addSummary("A:3"); + new SummarizeDifferenceTest().addDiff("A", "A", "A", "B").addSummary("A:3", "B:1"); + new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B").addSummary("A:3", "B:2"); + new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B", "C").addSummary("A:3", "B:2", "C:1"); + new SummarizeDifferenceTest().addDiff("A.X", "A.X").addSummary("A.X:2"); + new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X").addSummary("*.X:3", "A.X:2", "B.X:1"); + new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X", "B.X").addSummary("*.X:4", "A.X:2", "B.X:2"); + new SummarizeDifferenceTest().addDiff("A.B.C", "X.B.C").addSummary("*.B.C:2", "A.B.C:1", "X.B.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "X.Y.C", "X.Y.C").addSummary("*.*.C:3", "X.Y.C:2", "A.B.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "X.Y.C").addSummary("*.*.C:3", "A.B.C:1", "A.X.C:1", "X.Y.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C").addSummary("*.*.C:3", "*.X.C:2", "A.B.C:1", "A.X.C:1", "B.X.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C", "B.X.C").addSummary("*.*.C:4", "*.X.C:3", "B.X.C:2", "A.B.C:1", "A.X.C:1"); + + return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); + } + + + @Test(enabled = true, dependsOnMethods = "testSummarizePath", dataProvider = "summaries") + public void testSummarizeDifferences(SummarizeDifferenceTest test) { + test.test(); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java new file mode 100644 index 000000000..534416d29 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.gatk.walkers.diffengine; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DiffNodeUnitTest extends BaseTest { + // Data is: + // MY_ROOT + // fields: A=A, B=B + // nodes: C, D + // C: fields: E=E, nodes: none + // D: fields: F=F, G=G, nodes: none + static DiffNode MY_ROOT = DiffNode.rooted("MY_ROOT"); + static DiffValue Value_A = new DiffValue("A", MY_ROOT, "A"); + static DiffValue Value_B = new DiffValue("B", MY_ROOT, "B"); + static DiffNode NODE_C = DiffNode.empty("C", MY_ROOT); + static DiffNode NODE_D = DiffNode.empty("D", MY_ROOT); + static DiffValue Value_E = new DiffValue("E", NODE_C, "E"); + static DiffValue Value_F = new DiffValue("F", NODE_D, "F"); + static DiffValue Value_G = new DiffValue("G", NODE_D, "G"); + + static { + MY_ROOT.add(Value_A); + MY_ROOT.add(Value_B); + MY_ROOT.add(NODE_C); + MY_ROOT.add(NODE_D); + NODE_C.add(Value_E); + NODE_D.add(Value_F); + NODE_D.add(Value_G); + } + + + // -------------------------------------------------------------------------------- + // + // Element testing routines + // + // -------------------------------------------------------------------------------- + + private class ElementTest extends TestDataProvider { + public DiffElement elt; + public String name; + public String fullName; + public DiffElement parent; + + private ElementTest(DiffValue elt, DiffValue parent, String name, String fullName) { + this(elt.getBinding(), parent.getBinding(), name, fullName); + } + + private ElementTest(DiffElement elt, DiffElement parent, String name, String fullName) { + super(ElementTest.class); + this.elt = elt; + this.name = name; + this.fullName = fullName; + this.parent = parent; + } + + public String toString() { + return String.format("ElementTest elt=%s name=%s fullName=%s parent=%s", + elt.toOneLineString(), name, fullName, parent.getName()); + } + } + + @DataProvider(name = "elementdata") + public Object[][] createElementData() { + new ElementTest(MY_ROOT.getBinding(), DiffElement.ROOT, "MY_ROOT", "MY_ROOT"); + new ElementTest(NODE_C, MY_ROOT, "C", "MY_ROOT.C"); + new ElementTest(NODE_D, MY_ROOT, "D", "MY_ROOT.D"); + new ElementTest(Value_A, MY_ROOT, "A", "MY_ROOT.A"); + new ElementTest(Value_B, MY_ROOT, "B", "MY_ROOT.B"); + new ElementTest(Value_E, NODE_C, "E", "MY_ROOT.C.E"); + new ElementTest(Value_F, NODE_D, "F", "MY_ROOT.D.F"); + new ElementTest(Value_G, NODE_D, "G", "MY_ROOT.D.G"); + return TestDataProvider.getTests(ElementTest.class); + } + + @Test(enabled = true, dataProvider = "elementdata") + public void testElementMethods(ElementTest test) { + Assert.assertNotNull(test.elt.getName()); + Assert.assertNotNull(test.elt.getParent()); + Assert.assertEquals(test.elt.getName(), test.name); + Assert.assertEquals(test.elt.getParent(), test.parent); + Assert.assertEquals(test.elt.fullyQualifiedName(), test.fullName); + } + + // -------------------------------------------------------------------------------- + // + // DiffValue testing routines + // + // -------------------------------------------------------------------------------- + + private class LeafTest extends TestDataProvider { + public DiffValue diffvalue; + public Object value; + + private LeafTest(DiffValue diffvalue, Object value) { + super(LeafTest.class); + this.diffvalue = diffvalue; + this.value = value; + } + + public String toString() { + return String.format("LeafTest diffvalue=%s value=%s", diffvalue.toOneLineString(), value); + } + } + + @DataProvider(name = "leafdata") + public Object[][] createLeafData() { + new LeafTest(Value_A, "A"); + new LeafTest(Value_B, "B"); + new LeafTest(Value_E, "E"); + new LeafTest(Value_F, "F"); + new LeafTest(Value_G, "G"); + return TestDataProvider.getTests(LeafTest.class); + } + + @Test(enabled = true, dataProvider = "leafdata") + public void testLeafMethods(LeafTest test) { + Assert.assertNotNull(test.diffvalue.getValue()); + Assert.assertEquals(test.diffvalue.getValue(), test.value); + } + + // -------------------------------------------------------------------------------- + // + // Node testing routines + // + // -------------------------------------------------------------------------------- + + private class NodeTest extends TestDataProvider { + public DiffNode node; + public Set fields; + public Set subnodes; + public Set allNames; + + private NodeTest(DiffNode node, List fields, List subnodes) { + super(NodeTest.class); + this.node = node; + this.fields = new HashSet(fields); + this.subnodes = new HashSet(subnodes); + this.allNames = new HashSet(fields); + allNames.addAll(subnodes); + } + + public String toString() { + return String.format("NodeTest node=%s fields=%s subnodes=%s", + node.toOneLineString(), fields, subnodes); + } + } + + @DataProvider(name = "nodedata") + public Object[][] createData1() { + new NodeTest(MY_ROOT, Arrays.asList("A", "B"), Arrays.asList("C", "D")); + new NodeTest(NODE_C, Arrays.asList("E"), Collections.emptyList()); + new NodeTest(NODE_D, Arrays.asList("F", "G"), Collections.emptyList()); + return TestDataProvider.getTests(NodeTest.class); + } + + @Test(enabled = true, dataProvider = "nodedata") + public void testNodeAccessors(NodeTest test) { + Assert.assertNotNull(test.node.getElements()); + + for ( String name : test.allNames ) { + DiffElement elt = test.node.getElement(name); + Assert.assertNotNull(elt, "Failed to find field " + elt + " in " + test.node); + Assert.assertEquals(elt.getName(), name); + Assert.assertEquals(elt.getValue().isAtomic(), test.fields.contains(name), "Failed atomic/compound expectation: " + test.node); + } + } + + // NOTE: add routines are being implicitly tested by the creation of the data structures + + @Test(enabled = true, dataProvider = "nodedata") + public void testCounts(NodeTest test) { + Assert.assertEquals(test.node.getElements().size(), test.allNames.size()); + Assert.assertEquals(test.node.getElementNames(), test.allNames); + } + + // -------------------------------------------------------------------------------- + // + // fromString testing routines + // + // -------------------------------------------------------------------------------- + + private class FromStringTest extends TestDataProvider { + public String string; + public DiffElement expected; + + private FromStringTest(String string, DiffElement expected) { + super(FromStringTest.class); + this.string = string; + this.expected = expected; + } + + public String toString() { + return String.format("FromStringTest string=%s expected=%s", string, expected.toOneLineString()); + } + } + + @DataProvider(name = "fromstringdata") + public Object[][] createFromData() { + new FromStringTest("A=A", Value_A.getBinding()); + new FromStringTest("B=B", Value_B.getBinding()); + new FromStringTest("C=(E=E)", NODE_C.getBinding()); + new FromStringTest("D=(F=F G=G)", NODE_D.getBinding()); + return TestDataProvider.getTests(FromStringTest.class); + } + + @Test(enabled = true, dataProvider = "fromstringdata") + public void parseFromString(FromStringTest test) { + logger.warn("Testing from string: " + test.string); + DiffElement elt = DiffNode.fromString(test.string); + Assert.assertEquals(elt.toOneLineString(), test.expected.toOneLineString()); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java new file mode 100644 index 000000000..5738b643f --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.gatk.walkers.diffengine; + + +// the imports for unit testing. + + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DiffableReaderUnitTest extends BaseTest { + DiffEngine engine; + + File vcfFile = new File(testDir + "diffTestMaster.vcf"); + File bamFile = new File(testDir + "exampleBAM.bam"); + + @BeforeClass(enabled = true) + public void createDiffEngine() { + engine = new DiffEngine(); + } + + @Test(enabled = true) + public void testPluggableDiffableReaders() { + logger.warn("testPluggableDiffableReaders"); + Map readers = engine.getReaders(); + Assert.assertNotNull(readers); + Assert.assertTrue(readers.size() > 0); + Assert.assertNotNull(readers.get("VCF")); + for ( Map.Entry e : engine.getReaders().entrySet() ) { + logger.warn("Found diffable reader: " + e.getKey()); + Assert.assertEquals(e.getValue().getName(), e.getKey()); + Assert.assertEquals(e.getValue(), engine.getReader(e.getKey())); + } + } + + private static void testLeaf(DiffNode rec, String field, Object expected) { + DiffElement value = rec.getElement(field); + Assert.assertNotNull(value, "Expected to see leaf named " + field + " in rec " + rec); + Assert.assertEquals(value.getValue().getValue(), expected, "Expected to leaf named " + field + " to have value " + expected + " in rec " + rec); + } + + @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") + public void testVCF1() { + logger.warn("testVCF1"); + DiffableReader vcfReader = engine.getReader("VCF"); + Assert.assertTrue(vcfReader.canRead(vcfFile)); + Assert.assertFalse(vcfReader.canRead(bamFile)); + + DiffElement diff = vcfReader.readFromFile(vcfFile); + Assert.assertNotNull(diff); + + Assert.assertEquals(diff.getName(), vcfFile.getName()); + Assert.assertSame(diff.getParent(), DiffElement.ROOT); + + DiffNode node = diff.getValueAsNode(); + Assert.assertEquals(node.getElements().size(), 9); + + // chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 + DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode(); + testLeaf(rec1, "CHROM", "chr1"); + testLeaf(rec1, "POS", 2646); + testLeaf(rec1, "ID", "rs62635284"); + testLeaf(rec1, "REF", Allele.create("G", true)); + testLeaf(rec1, "ALT", new HashSet(Arrays.asList(Allele.create("A")))); + testLeaf(rec1, "QUAL", 0.15); + testLeaf(rec1, "FILTER", Collections.emptySet()); + testLeaf(rec1, "AC", "2"); + testLeaf(rec1, "AF", "1.00"); + testLeaf(rec1, "AN", "2"); + } + + @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") + public void testBAM() { + logger.warn("testBAM"); + DiffableReader bamReader = engine.getReader("BAM"); + Assert.assertTrue(bamReader.canRead(bamFile)); + Assert.assertFalse(bamReader.canRead(vcfFile)); + + DiffElement diff = bamReader.readFromFile(bamFile); + Assert.assertNotNull(diff); + + Assert.assertEquals(diff.getName(), bamFile.getName()); + Assert.assertSame(diff.getParent(), DiffElement.ROOT); + + DiffNode node = diff.getValueAsNode(); + Assert.assertEquals(node.getElements().size(), 33); + + // 30PPJAAXX090125:1:42:512:1817#0 99 chr1 200 0 76M = + // 255 -130 ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC + // BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3: + // PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam + + DiffNode rec1 = node.getElement("30PPJAAXX090125:1:42:512:1817#0_1").getValueAsNode(); + testLeaf(rec1, "NAME", "30PPJAAXX090125:1:42:512:1817#0"); + testLeaf(rec1, "FLAGS", 99); + testLeaf(rec1, "RNAME", "chr1"); + testLeaf(rec1, "POS", 200); + testLeaf(rec1, "MAPQ", 0); + testLeaf(rec1, "CIGAR", "76M"); + testLeaf(rec1, "RNEXT", "chr1"); + testLeaf(rec1, "PNEXT", 255); + testLeaf(rec1, "TLEN", -130); + testLeaf(rec1, "SEQ", "ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC"); + testLeaf(rec1, "QUAL", "BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3:"); + testLeaf(rec1, "PG", "0"); + testLeaf(rec1, "RG", "exampleBAM.bam"); + testLeaf(rec1, "SM", "exampleBAM.bam"); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java new file mode 100644 index 000000000..da272ec30 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.gatk.walkers.diffengine; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DifferenceUnitTest extends BaseTest { + // -------------------------------------------------------------------------------- + // + // testing routines + // + // -------------------------------------------------------------------------------- + + private class DifferenceTest extends TestDataProvider { + public DiffElement tree1, tree2; + public String difference; + + private DifferenceTest(String tree1, String tree2, String difference) { + this(DiffNode.fromString(tree1), DiffNode.fromString(tree2), difference); + } + + private DifferenceTest(DiffElement tree1, DiffElement tree2, String difference) { + super(DifferenceTest.class); + this.tree1 = tree1; + this.tree2 = tree2; + this.difference = difference; + } + + public String toString() { + return String.format("tree1=%s tree2=%s diff=%s", + tree1 == null ? "null" : tree1.toOneLineString(), + tree2 == null ? "null" : tree2.toOneLineString(), + difference); + } + } + + @DataProvider(name = "data") + public Object[][] createTrees() { + new DifferenceTest("A=X", "A=Y", "A:X!=Y"); + new DifferenceTest("A=Y", "A=X", "A:Y!=X"); + new DifferenceTest(DiffNode.fromString("A=X"), null, "A:X!=MISSING"); + new DifferenceTest(null, DiffNode.fromString("A=X"), "A:MISSING!=X"); + return DifferenceTest.getTests(DifferenceTest.class); + } + + @Test(enabled = true, dataProvider = "data") + public void testDiffToString(DifferenceTest test) { + logger.warn("Test tree1: " + (test.tree1 == null ? "null" : test.tree1.toOneLineString())); + logger.warn("Test tree2: " + (test.tree2 == null ? "null" : test.tree2.toOneLineString())); + logger.warn("Test expected diff : " + test.difference); + Difference diff = new Difference(test.tree1, test.tree2); + logger.warn("Observed diffs : " + diff); + Assert.assertEquals(diff.toString(), test.difference, "Observed diff string " + diff + " not equal to expected difference string " + test.difference ); + + } +} \ No newline at end of file From 8056a3fe89046d942c4b656ff8138283e0235769 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 12 Jul 2011 08:52:31 -0400 Subject: [PATCH 02/35] getElement() now uses O(1) get from hash instead of linear O(n) search. Enables us to read large files easily. --- .../sting/gatk/walkers/diffengine/DiffNode.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java index 0720e18c0..3e1be8609 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java @@ -107,11 +107,13 @@ public class DiffNode extends DiffValue { return getElements(false); } + /** + * Returns the element bound to name, or null if no such binding exists + * @param name + * @return + */ public DiffElement getElement(String name) { - for ( DiffElement elt : getElements() ) - if ( elt.getName().equals(name) ) - return elt; - return null; + return getElementMap().get(name); } /** From 05212aea62b2f78f7a739257bac86fd0b16d2c5b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 12 Jul 2011 08:53:19 -0400 Subject: [PATCH 03/35] reader now takes an argument for the maximum number of elements to read from the file. --- .../walkers/diffengine/BAMDiffableReader.java | 5 ++--- .../gatk/walkers/diffengine/DiffEngine.java | 7 ++++++- .../walkers/diffengine/DiffObjectsWalker.java | 17 ++++++++++------- .../gatk/walkers/diffengine/DiffableReader.java | 2 +- .../walkers/diffengine/VCFDiffableReader.java | 10 ++++++++-- .../diffengine/DiffableReaderUnitTest.java | 4 ++-- 6 files changed, 29 insertions(+), 16 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java index f7a395d9d..a5ebf27bb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java @@ -51,12 +51,11 @@ import java.util.zip.GZIPInputStream; * Class implementing diffnode reader for VCF */ public class BAMDiffableReader implements DiffableReader { - private final static int MAX_RECORDS_TO_READ = 1000; @Override public String getName() { return "BAM"; } @Override - public DiffElement readFromFile(File file) { + public DiffElement readFromFile(File file, int maxElementsToRead) { final SAMFileReader reader = new SAMFileReader(file, null); // null because we don't want it to look for the index reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); @@ -65,7 +64,7 @@ public class BAMDiffableReader implements DiffableReader { int count = 0; while ( iterator.hasNext() ) { - if ( count++ > MAX_RECORDS_TO_READ ) + if ( count++ > maxElementsToRead && maxElementsToRead != -1) break; final SAMRecord record = iterator.next(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java index ba2713bff..54a7a464d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -385,12 +385,17 @@ public class DiffEngine { return findReaderForFile(file) != null; } + public DiffElement createDiffableFromFile(File file) { + return createDiffableFromFile(file, -1); + } + + public DiffElement createDiffableFromFile(File file, int maxElementsToRead) { DiffableReader reader = findReaderForFile(file); if ( reader == null ) throw new UserException("Unsupported file type: " + file); else - return reader.readFromFile(file); + return reader.readFromFile(file, maxElementsToRead); } public static boolean simpleDiffFiles(File masterFile, File testFile, DiffEngine.SummaryReportParams params) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java index a08108db2..fe411b195 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java @@ -48,11 +48,14 @@ public class DiffObjectsWalker extends RodWalker { @Output(doc="File to which results should be written",required=true) protected PrintStream out; - @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false) - int MAX_RECORDS = 0; + @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) + int MAX_OBJECTS_TO_READ = -1; - @Argument(fullName="maxCount1Records", shortName="M1", doc="Max. number of records occuring exactly once in the file to process", required=false) - int MAX_COUNT1_RECORDS = 0; + @Argument(fullName="maxDiffs", shortName="M", doc="Max. number of diffs to process", required=false) + int MAX_DIFFS = 0; + + @Argument(fullName="maxCount1Diffs", shortName="M1", doc="Max. number of diffs occuring exactly once in the file to process", required=false) + int MAX_COUNT1_DIFFS = 0; @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) int minCountForDiff = 1; @@ -91,9 +94,9 @@ public class DiffObjectsWalker extends RodWalker { @Override public void onTraversalDone(Integer sum) { out.printf("Reading master file %s%n", masterFile); - DiffElement master = diffEngine.createDiffableFromFile(masterFile); + DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ); out.printf("Reading test file %s%n", testFile); - DiffElement test = diffEngine.createDiffableFromFile(testFile); + DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ); // out.printf("Master diff objects%n"); // out.println(master.toString()); @@ -107,7 +110,7 @@ public class DiffObjectsWalker extends RodWalker { out.printf("DIFF: %s%n", diff.toString()); } - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_RECORDS, MAX_COUNT1_RECORDS, minCountForDiff); + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff); diffEngine.reportSummarizedDifferences(diffs, params); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java index 84c2eed10..af5771c55 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java @@ -43,7 +43,7 @@ public interface DiffableReader { @Ensures("result != null") @Requires("file != null") - public DiffElement readFromFile(File file); + public DiffElement readFromFile(File file, int maxElementsToRead); @Requires("file != null") public boolean canRead(File file); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index 743178538..06d14366f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -51,15 +51,21 @@ public class VCFDiffableReader implements DiffableReader { public String getName() { return "VCF"; } @Override - public DiffElement readFromFile(File file) { + public DiffElement readFromFile(File file, int maxElementsToRead) { DiffNode root = DiffNode.rooted(file.getName()); try { LineReader lineReader = new AsciiLineReader(new FileInputStream(file)); VCFCodec vcfCodec = new VCFCodec(); - VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader); + + // must be read as state is stored in reader itself + vcfCodec.readHeader(lineReader); String line = lineReader.readLine(); + int count = 0; while ( line != null ) { + if ( count++ > maxElementsToRead && maxElementsToRead != -1) + break; + VariantContext vc = (VariantContext)vcfCodec.decode(line); String name = vc.getChr() + ":" + vc.getStart(); DiffNode vcRoot = DiffNode.empty(name, root); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java index 5738b643f..baa2f0383 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java @@ -80,7 +80,7 @@ public class DiffableReaderUnitTest extends BaseTest { Assert.assertTrue(vcfReader.canRead(vcfFile)); Assert.assertFalse(vcfReader.canRead(bamFile)); - DiffElement diff = vcfReader.readFromFile(vcfFile); + DiffElement diff = vcfReader.readFromFile(vcfFile, -1); Assert.assertNotNull(diff); Assert.assertEquals(diff.getName(), vcfFile.getName()); @@ -110,7 +110,7 @@ public class DiffableReaderUnitTest extends BaseTest { Assert.assertTrue(bamReader.canRead(bamFile)); Assert.assertFalse(bamReader.canRead(vcfFile)); - DiffElement diff = bamReader.readFromFile(bamFile); + DiffElement diff = bamReader.readFromFile(bamFile, -1); Assert.assertNotNull(diff); Assert.assertEquals(diff.getName(), bamFile.getName()); From ccedd6ff4c942c20c1a57f6a6bf65c5cb63b6e16 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 12 Jul 2011 15:20:28 -0400 Subject: [PATCH 04/35] Difference is now the general form -- used to be SummarizedDifference. The old Difference class is now a subclass of Difference that includes pointers to specific the master and test DiffElements. Added a size() function that calculates the number of elements tree from a DiffElement. --- .../gatk/walkers/diffengine/DiffElement.java | 4 + .../gatk/walkers/diffengine/DiffEngine.java | 142 +++++------------- .../gatk/walkers/diffengine/DiffNode.java | 7 + .../walkers/diffengine/DiffObjectsWalker.java | 7 +- .../gatk/walkers/diffengine/DiffValue.java | 1 + .../gatk/walkers/diffengine/Difference.java | 83 +++++++--- .../diffengine/SpecificDifference.java | 59 ++++++++ .../diffengine/DiffEngineUnitTest.java | 6 +- .../diffengine/DifferenceUnitTest.java | 2 +- 9 files changed, 176 insertions(+), 135 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/SpecificDifference.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java index eff24bb88..4c3f7bd95 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java @@ -115,4 +115,8 @@ public class DiffElement { else throw new ReviewedStingException("Illegal request conversion of a DiffValue into a DiffNode: " + this); } + + public int size() { + return 1 + getValue().size(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java index 54a7a464d..6d85df71d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -24,11 +24,9 @@ package org.broadinstitute.sting.gatk.walkers.diffengine; -import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -60,7 +58,7 @@ public class DiffEngine { // // -------------------------------------------------------------------------------- - public List diff(DiffElement master, DiffElement test) { + public List diff(DiffElement master, DiffElement test) { DiffValue masterValue = master.getValue(); DiffValue testValue = test.getValue(); @@ -70,14 +68,14 @@ public class DiffEngine { return diff(masterValue, testValue); } else { // structural difference in types. one is node, other is leaf - return Arrays.asList(new Difference(master, test)); + return Arrays.asList(new SpecificDifference(master, test)); } } - public List diff(DiffNode master, DiffNode test) { + public List diff(DiffNode master, DiffNode test) { Set allNames = new HashSet(master.getElementNames()); allNames.addAll(test.getElementNames()); - List diffs = new ArrayList(); + List diffs = new ArrayList(); for ( String name : allNames ) { DiffElement masterElt = master.getElement(name); @@ -86,7 +84,7 @@ public class DiffEngine { throw new ReviewedStingException("BUG: unexceptedly got two null elements for field: " + name); } else if ( masterElt == null || testElt == null ) { // if either is null, we are missing a value // todo -- should one of these be a special MISSING item? - diffs.add(new Difference(masterElt, testElt)); + diffs.add(new SpecificDifference(masterElt, testElt)); } else { diffs.addAll(diff(masterElt, testElt)); } @@ -95,11 +93,11 @@ public class DiffEngine { return diffs; } - public List diff(DiffValue master, DiffValue test) { + public List diff(DiffValue master, DiffValue test) { if ( master.getValue().equals(test.getValue()) ) { return Collections.emptyList(); } else { - return Arrays.asList(new Difference(master.getBinding(), test.getBinding())); + return Arrays.asList(new SpecificDifference(master.getBinding(), test.getBinding())); } } @@ -147,64 +145,68 @@ public class DiffEngine { * @param params determines how we display the items * @param diffs */ - public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { + public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { printSummaryReport(summarizeDifferences(diffs), params ); } - public List summarizeDifferences(List diffs) { - List diffPaths = new ArrayList(diffs.size()); - - for ( Difference diff1 : diffs ) { - diffPaths.add(diffNameToPath(diff1.getFullyQualifiedName())); - } - - return summarizedDifferencesOfPaths(diffPaths); + public List summarizeDifferences(List diffs) { + return summarizedDifferencesOfPaths(diffs); } final protected static String[] diffNameToPath(String diffName) { return diffName.split("\\."); } - protected List summarizedDifferencesOfPaths(List diffPaths) { - Map summaries = new HashMap(); + protected List summarizedDifferencesOfPathsFromString(List singletonDiffs) { + List diffs = new ArrayList(); + + for ( String diff : singletonDiffs ) { + diffs.add(new Difference(diff)); + } + + return summarizedDifferencesOfPaths(diffs); + } + + protected List summarizedDifferencesOfPaths(List singletonDiffs) { + Map summaries = new HashMap(); // create the initial set of differences - for ( int i = 0; i < diffPaths.size(); i++ ) { + for ( int i = 0; i < singletonDiffs.size(); i++ ) { for ( int j = 0; j <= i; j++ ) { - String[] diffPath1 = diffPaths.get(i); - String[] diffPath2 = diffPaths.get(j); - if ( diffPath1.length == diffPath2.length ) { - int lcp = longestCommonPostfix(diffPath1, diffPath2); - String path = lcp > 0 ? summarizedPath(diffPath2, lcp) : Utils.join(".", diffPath2); + Difference diffPath1 = singletonDiffs.get(i); + Difference diffPath2 = singletonDiffs.get(j); + if ( diffPath1.length() == diffPath2.length() ) { + int lcp = longestCommonPostfix(diffPath1.getParts(), diffPath2.getParts()); + String path = lcp > 0 ? summarizedPath(diffPath2.getParts(), lcp) : diffPath2.getPath(); addSummary(summaries, path, true); } } } // count differences - for ( String[] diffPath : diffPaths ) { - for ( SummarizedDifference sumDiff : summaries.values() ) { - if ( sumDiff.matches(diffPath) ) + for ( Difference diffPath : singletonDiffs ) { + for ( Difference sumDiff : summaries.values() ) { + if ( sumDiff.matches(diffPath.getParts()) ) addSummary(summaries, sumDiff.getPath(), false); } } - List sortedSummaries = new ArrayList(summaries.values()); + List sortedSummaries = new ArrayList(summaries.values()); Collections.sort(sortedSummaries); return sortedSummaries; } - private static void addSummary(Map summaries, String path, boolean onlyCatalog) { + private static void addSummary(Map summaries, String path, boolean onlyCatalog) { if ( summaries.containsKey(path) ) { if ( ! onlyCatalog ) summaries.get(path).incCount(); } else { - SummarizedDifference sumDiff = new SummarizedDifference(path); + Difference sumDiff = new Difference(path); summaries.put(sumDiff.getPath(), sumDiff); } } - protected void printSummaryReport(List sortedSummaries, SummaryReportParams params ) { + protected void printSummaryReport(List sortedSummaries, SummaryReportParams params ) { GATKReport report = new GATKReport(); final String tableName = "diffences"; report.addTable(tableName, "Summarized differences between the master and test files.\nSee http://www.broadinstitute.org/gsa/wiki/index.php/DiffObjectsWalker_and_SummarizedDifferences for more information"); @@ -213,7 +215,7 @@ public class DiffEngine { table.addColumn("NumberOfOccurrences", 0); int count = 0, count1 = 0; - for ( SummarizedDifference diff : sortedSummaries ) { + for ( Difference diff : sortedSummaries ) { if ( diff.getCount() < params.minSumDiffToShow ) // in order, so break as soon as the count is too low break; @@ -261,76 +263,6 @@ public class DiffEngine { return Utils.join(".", parts); } - /** - * TODO -- all of the algorithms above should use SummarizedDifference instead - * TODO -- of some SummarizedDifferences and some low-level String[] - */ - public static class SummarizedDifference implements Comparable { - final String path; // X.Y.Z - final String[] parts; - int count = 0; - - public SummarizedDifference(String path) { - this.path = path; - this.parts = diffNameToPath(path); - } - - public void incCount() { count++; } - - public int getCount() { - return count; - } - - /** - * The fully qualified path object A.B.C etc - * @return - */ - public String getPath() { - return path; - } - - /** - * @return the length of the parts of this summary - */ - public int length() { - return this.parts.length; - } - - /** - * Returns true if the string parts matches this summary. Matches are - * must be equal() everywhere where this summary isn't *. - * @param otherParts - * @return - */ - public boolean matches(String[] otherParts) { - if ( otherParts.length != length() ) - return false; - - // TODO optimization: can start at right most non-star element - for ( int i = 0; i < length(); i++ ) { - String part = parts[i]; - if ( ! part.equals("*") && ! part.equals(otherParts[i]) ) - return false; - } - - return true; - } - - @Override - public String toString() { - return String.format("%s:%d", getPath(), getCount()); - } - - @Override - public int compareTo(SummarizedDifference other) { - // sort first highest to lowest count, then by lowest to highest path - int countCmp = Integer.valueOf(count).compareTo(other.count); - return countCmp != 0 ? -1 * countCmp : path.compareTo(other.path); - } - - - } - // -------------------------------------------------------------------------------- // // plugin manager @@ -404,7 +336,7 @@ public class DiffEngine { if ( diffEngine.canRead(masterFile) && diffEngine.canRead(testFile) ) { DiffElement master = diffEngine.createDiffableFromFile(masterFile); DiffElement test = diffEngine.createDiffableFromFile(testFile); - List diffs = diffEngine.diff(master, test); + List diffs = diffEngine.diff(master, test); diffEngine.reportSummarizedDifferences(diffs, params); return true; } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java index 3e1be8609..2f48de2d3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java @@ -153,6 +153,13 @@ public class DiffNode extends DiffValue { add(new DiffElement(name, this.getBinding(), new DiffValue(value))); } + public int size() { + int count = 0; + for ( DiffElement value : getElements() ) + count += value.size(); + return count; + } + // --------------------------------------------------------------------------- // // toString diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java index fe411b195..ecb836af9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.gatk.walkers.diffengine; -import org.apache.xmlbeans.impl.tool.Diff; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -95,18 +94,20 @@ public class DiffObjectsWalker extends RodWalker { public void onTraversalDone(Integer sum) { out.printf("Reading master file %s%n", masterFile); DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ); + out.printf(" Read %d objects%n", master.size()); out.printf("Reading test file %s%n", testFile); DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ); + out.printf(" Read %d objects%n", test.size()); // out.printf("Master diff objects%n"); // out.println(master.toString()); // out.printf("Test diff objects%n"); // out.println(test.toString()); - List diffs = diffEngine.diff(master, test); + List diffs = diffEngine.diff(master, test); if ( showItemizedDifferences ) { out.printf("Itemized results%n"); - for ( Difference diff : diffs ) + for ( SpecificDifference diff : diffs ) out.printf("DIFF: %s%n", diff.toString()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java index 7245e9e8d..3750496a1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java @@ -87,4 +87,5 @@ public class DiffValue { public boolean isAtomic() { return true; } public boolean isCompound() { return ! isAtomic(); } + public int size() { return 1; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java index 6627a4cc5..efc6ef160 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java @@ -24,35 +24,72 @@ package org.broadinstitute.sting.gatk.walkers.diffengine; -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 12:53 PM - * - * Represents a specific difference between two specific DiffElements - */ -public class Difference { - DiffElement master, test; +public class Difference implements Comparable { + final String path; // X.Y.Z + final String[] parts; + int count = 0; - public Difference(DiffElement master, DiffElement test) { - if ( master == null && test == null ) throw new IllegalArgumentException("Master and test both cannot be null"); - this.master = master; - this.test = test; + public Difference(String path) { + this.path = path; + this.parts = DiffEngine.diffNameToPath(path); } + public String[] getParts() { + return parts; + } + + public void incCount() { count++; } + + public int getCount() { + return count; + } + + /** + * The fully qualified path object A.B.C etc + * @return + */ + public String getPath() { + return path; + } + + /** + * @return the length of the parts of this summary + */ + public int length() { + return this.parts.length; + } + + /** + * Returns true if the string parts matches this summary. Matches are + * must be equal() everywhere where this summary isn't *. + * @param otherParts + * @return + */ + public boolean matches(String[] otherParts) { + if ( otherParts.length != length() ) + return false; + + // TODO optimization: can start at right most non-star element + for ( int i = 0; i < length(); i++ ) { + String part = parts[i]; + if ( ! part.equals("*") && ! part.equals(otherParts[i]) ) + return false; + } + + return true; + } + + @Override public String toString() { - return String.format("%s:%s!=%s", - getFullyQualifiedName(), - getOneLineString(master), - getOneLineString(test)); + return String.format("%s:%d", getPath(), getCount()); } - public String getFullyQualifiedName() { - return (master == null ? test : master).fullyQualifiedName(); + @Override + public int compareTo(Difference other) { + // sort first highest to lowest count, then by lowest to highest path + int countCmp = Integer.valueOf(count).compareTo(other.count); + return countCmp != 0 ? -1 * countCmp : path.compareTo(other.path); } - private static String getOneLineString(DiffElement elt) { - return elt == null ? "MISSING" : elt.getValue().toOneLineString(); - } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/SpecificDifference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/SpecificDifference.java new file mode 100644 index 000000000..2fe9b47f8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/SpecificDifference.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:53 PM + * + * Represents a specific difference between two specific DiffElements + */ +public class SpecificDifference extends Difference { + DiffElement master, test; + + public SpecificDifference(DiffElement master, DiffElement test) { + super(createName(master, test)); + if ( master == null && test == null ) throw new IllegalArgumentException("Master and test both cannot be null"); + this.master = master; + this.test = test; + } + + public String toString() { + return String.format("%s:%s!=%s", + getPath(), + getOneLineString(master), + getOneLineString(test)); + } + + private static String createName(DiffElement master, DiffElement test) { + return (master == null ? test : master).fullyQualifiedName(); + } + + private static String getOneLineString(DiffElement elt) { + return elt == null ? "MISSING" : elt.getValue().toOneLineString(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java index cd6c3598a..96dfec6e8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java @@ -99,7 +99,7 @@ public class DiffEngineUnitTest extends BaseTest { logger.warn("Test tree1: " + test.tree1.toOneLineString()); logger.warn("Test tree2: " + test.tree2.toOneLineString()); - List diffs = engine.diff(test.tree1, test.tree2); + List diffs = engine.diff(test.tree1, test.tree2); logger.warn("Test expected diff : " + test.differences); logger.warn("Observed diffs : " + diffs); } @@ -185,12 +185,12 @@ public class DiffEngineUnitTest extends BaseTest { List diffPaths = new ArrayList(diffs.size()); for ( String diff : diffs ) { diffPaths.add(DiffEngine.diffNameToPath(diff)); } - List sumDiffs = engine.summarizedDifferencesOfPaths(diffPaths); + List sumDiffs = engine.summarizedDifferencesOfPathsFromString(diffs); Assert.assertEquals(sumDiffs.size(), expecteds.size(), "Unexpected number of summarized differences: " + sumDiffs); for ( int i = 0; i < sumDiffs.size(); i++ ) { - DiffEngine.SummarizedDifference sumDiff = sumDiffs.get(i); + Difference sumDiff = sumDiffs.get(i); String expected = expecteds.get(i); String[] pathCount = expected.split(":"); String path = pathCount[0]; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java index da272ec30..64579a01b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java @@ -87,7 +87,7 @@ public class DifferenceUnitTest extends BaseTest { logger.warn("Test tree1: " + (test.tree1 == null ? "null" : test.tree1.toOneLineString())); logger.warn("Test tree2: " + (test.tree2 == null ? "null" : test.tree2.toOneLineString())); logger.warn("Test expected diff : " + test.difference); - Difference diff = new Difference(test.tree1, test.tree2); + SpecificDifference diff = new SpecificDifference(test.tree1, test.tree2); logger.warn("Observed diffs : " + diff); Assert.assertEquals(diff.toString(), test.difference, "Observed diff string " + diff + " not equal to expected difference string " + test.difference ); From 5077c94d85929bad35fcc00bbeab0b8036aabe4a Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 12 Jul 2011 15:39:07 -0400 Subject: [PATCH 06/35] Adding MappingQualityUnavailableReadFilter to the SNP and indel CountCovariates --- .../recalibration/CountCovariatesWalker.java | 3 +- .../RecalibrationWalkersIntegrationTest.java | 91 +++---------------- 2 files changed, 15 insertions(+), 79 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 6673bec92..c21f548b3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broad.tribble.bed.BEDCodec; import org.broad.tribble.dbsnp.DbSNPCodec; +import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter; import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.commandline.Gather; @@ -75,7 +76,7 @@ import java.util.Map; @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file -@ReadFilters( {MappingQualityZeroReadFilter.class} ) // Filter out all reads with zero mapping quality +@ReadFilters( {MappingQualityZeroReadFilter.class, MappingQualityUnavailableReadFilter.class} ) // Filter out all reads with zero or unavailable mapping quality @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta @PartitionBy(PartitionType.LOCUS) public class CountCovariatesWalker extends LocusWalker implements TreeReducible { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index b0f76229b..129161da3 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -19,9 +19,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { public void testCountCovariates1() { HashMap e = new HashMap(); e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "7b5832d4b2a23b8ef2bb639eb59bfa88" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "f4f8a49bb5764d2a8f61e055f64dcce4"); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "9c006f8e9fb5752b1c139f5a8cc7ea88"); e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "e6f7b4ab9aa291022e0ba8b7dbe4c77e" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "570506533f079d738d70934dfe1c02cd" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "e6b98af01c5a08e4954b79ec42db6fc3" ); for ( String parallelism : Arrays.asList("", " -nt 4")) { for ( Map.Entry entry : e.entrySet() ) { @@ -53,9 +53,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { public void testTableRecalibrator1() { HashMap e = new HashMap(); e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0278cce4cfdab869dc0c11d6852a984b" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "344d4252143df8c2cce6b568747553a5"); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "6797d7ffa4ef6c48413719ba32696ccf"); e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "2bb3374dde131791d7638031ae3b3e10" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "064c4a7bdd23974c3a9c5f924540df76" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "1f9d8944b73169b367cb83b0d22e5432" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -107,7 +107,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorMaxQ70() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "344d4252143df8c2cce6b568747553a5" ); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0278cce4cfdab869dc0c11d6852a984b" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -133,12 +133,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } - - @Test public void testCountCovariatesSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "0a6cdb9611e5880ea6611205080aa267" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "c9ea5f995e1e2b7a5688533e678dcedc" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -164,7 +162,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "9bc7e1ad223ba759fe5e8ddb4c07369c" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "993fae4270e7e1e15986f270acf247af" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -189,13 +187,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } - - - @Test public void testCountCovariatesVCF() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "3700eaf567e4937f442fc777a226d6ad"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "170f0c3cc4b8d72c539136effeec9a16"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -219,7 +214,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesBED() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "6803891a3398821fc8a37e19ea8e5a00"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "b460478d9683e827784e42bc352db8bb"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -243,7 +238,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesVCFPlusDBsnp() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "f224c42fbc4026db973ccc91265ab5c7"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "a3d892bd60d8f679affda3c1e3af96c1"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -268,69 +263,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } - @Test - public void testCountCovariatesNoReadGroups() { - HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "c024e03f019aeceaf364fa58c8295ad8" ); - - for ( Map.Entry entry : e.entrySet() ) { - String bam = entry.getKey(); - String md5 = entry.getValue(); - - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b36KGReference + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -T CountCovariates" + - " -I " + bam + - " -L 1:10,000,000-10,200,000" + - " -cov ReadGroupCovariate" + - " -cov QualityScoreCovariate" + - " -cov CycleCovariate" + - " -cov DinucCovariate" + - " --default_read_group DefaultReadGroup" + - " --default_platform illumina" + - " --solid_recal_mode SET_Q_ZERO" + - " -recalFile %s", - 1, // just one output file - Arrays.asList(md5)); - List result = executeTest("testCountCovariatesNoReadGroups", spec).getFirst(); - paramsFilesNoReadGroupTest.put(bam, result.get(0).getAbsolutePath()); - } - } - - @Test - public void testTableRecalibratorNoReadGroups() { - HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "1eefbe7ac0376fc1ed1392d85242171e" ); - - for ( Map.Entry entry : e.entrySet() ) { - String bam = entry.getKey(); - String md5 = entry.getValue(); - String paramsFile = paramsFilesNoReadGroupTest.get(bam); - System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile); - if ( paramsFile != null ) { - WalkerTestSpec spec = new WalkerTestSpec( - "-R " + b36KGReference + - " -T TableRecalibration" + - " -I " + bam + - " -L 1:10,100,000-10,300,000" + - " -o %s" + - " --no_pg_tag" + - " --solid_recal_mode SET_Q_ZERO" + - " --default_read_group DefaultReadGroup" + - " --default_platform illumina" + - " -recalFile " + paramsFile, - 1, // just one output file - Arrays.asList(md5)); - executeTest("testTableRecalibratorNoReadGroups", spec); - } - } - } - @Test public void testCountCovariatesNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "cfc31bb6f51436d1c3b34f62bb801dc8" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "284ccac1f8fe485e52c86333cac7c2d4" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -356,7 +292,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "83b848a16034c2fb423d1bb0f5be7784" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "c167799c2d9cab815d7c9b23337f162e" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -380,11 +316,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } - @Test public void testCountCovariatesFailWithoutDBSNP() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", ""); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", ""); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); From 6007eea3ffba2e459ec6bf0a1c66c0a017fe0a62 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 13 Jul 2011 09:56:08 -0400 Subject: [PATCH 12/35] Allowing VCF records without GTs in vf4.1 --- .../utils/codecs/vcf/StandardVCFWriter.java | 41 +++++++++++++------ .../sting/utils/codecs/vcf/VCF3Codec.java | 13 +++--- .../sting/utils/codecs/vcf/VCFCodec.java | 28 ++++++------- .../sting/utils/variantcontext/Genotype.java | 35 +++++++++++++--- .../utils/variantcontext/VariantContext.java | 8 ++-- 5 files changed, 83 insertions(+), 42 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index a8bf74707..f7d09f16d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -32,6 +32,7 @@ import org.broad.tribble.index.IndexFactory; import org.broad.tribble.util.LittleEndianOutputStream; import org.broad.tribble.util.ParsingUtils; import org.broad.tribble.util.PositionalStream; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -300,10 +301,7 @@ public class StandardVCFWriter implements VCFWriter { } else { List genotypeAttributeKeys = new ArrayList(); if ( vc.hasGenotypes() ) { - genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); - for ( String key : calcVCFGenotypeKeys(vc) ) { - genotypeAttributeKeys.add(key); - } + genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc)); } else if ( mHeader.hasGenotypingData() ) { // this needs to be done in case all samples are no-calls genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); @@ -387,16 +385,22 @@ public class StandardVCFWriter implements VCFWriter { continue; } - writeAllele(g.getAllele(0), alleleMap); - for (int i = 1; i < g.getPloidy(); i++) { - mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); - writeAllele(g.getAllele(i), alleleMap); - } - List attrs = new ArrayList(genotypeFormatKeys.size()); for ( String key : genotypeFormatKeys ) { - if ( key.equals(VCFConstants.GENOTYPE_KEY) ) + + if ( key.equals(VCFConstants.GENOTYPE_KEY) ) { + if ( !g.isAvailable() ) { + throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record"); + } + + writeAllele(g.getAllele(0), alleleMap); + for (int i = 1; i < g.getPloidy(); i++) { + mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); + writeAllele(g.getAllele(i), alleleMap); + } + continue; + } Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4; @@ -488,10 +492,13 @@ public class StandardVCFWriter implements VCFWriter { private static List calcVCFGenotypeKeys(VariantContext vc) { Set keys = new HashSet(); + boolean sawGoodGT = false; boolean sawGoodQual = false; boolean sawGenotypeFilter = false; for ( Genotype g : vc.getGenotypes().values() ) { keys.addAll(g.getAttributes().keySet()); + if ( g.isAvailable() ) + sawGoodGT = true; if ( g.hasNegLog10PError() ) sawGoodQual = true; if (g.isFiltered() && g.isCalled()) @@ -504,7 +511,17 @@ public class StandardVCFWriter implements VCFWriter { if (sawGenotypeFilter) keys.add(VCFConstants.GENOTYPE_FILTER_KEY); - return ParsingUtils.sortList(new ArrayList(keys)); + List sortedList = ParsingUtils.sortList(new ArrayList(keys)); + + // make sure the GT is first + if ( sawGoodGT ) { + List newList = new ArrayList(sortedList.size()+1); + newList.add(VCFConstants.GENOTYPE_KEY); + newList.addAll(sortedList); + sortedList = newList; + } + + return sortedList; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index f3c99e963..c29f2ba8b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -141,8 +141,6 @@ public class VCF3Codec extends AbstractVCFCodec { boolean missing = i >= GTValueSplitSize; if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { - if (i != 0) - generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes"); genotypeAlleleLocation = i; } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); @@ -156,12 +154,13 @@ public class VCF3Codec extends AbstractVCFCodec { } } - // check to make sure we found a gentoype field - if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field"); + // check to make sure we found a genotype field + if ( genotypeAlleleLocation < 0 ) + generateException("Unable to find the GT field for the record; the GT field is required"); + if ( genotypeAlleleLocation > 0 ) + generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes"); - // todo -- assuming allele list length in the single digits is bad. Fix me. - // Check for > 1 for haploid genotypes - boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|'; + boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1; // add it to the list try { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 0fb2940bb..05fff5d9e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -145,8 +145,6 @@ public class VCFCodec extends AbstractVCFCodec { // todo -- all of these on the fly parsing of the missing value should be static constants if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { - if (i != 0) - generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes"); genotypeAlleleLocation = i; } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); @@ -160,22 +158,24 @@ public class VCFCodec extends AbstractVCFCodec { } } - // check to make sure we found a gentoype field - // TODO -- This is no longer required in v4.1 - if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field"); + // check to make sure we found a genotype field if we are a VCF4.0 file + if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 ) + generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0"); + if ( genotypeAlleleLocation > 0 ) + generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); - // todo -- assuming allele list length in the single digits is bad. Fix me. - // Check for > 1 for haploid genotypes - boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|'; + List GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap)); + boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1; // add it to the list try { - genotypes.put(sampleName, new Genotype(sampleName, - parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap), - GTQual, - genotypeFilters, - gtAttributes, - phased)); + genotypes.put(sampleName, + new Genotype(sampleName, + GTalleles, + GTQual, + genotypeFilters, + gtAttributes, + phased)); } catch (TribbleException e) { throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index 3a87f1196..0b5976c3c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -19,12 +20,14 @@ public class Genotype { protected InferredGeneticContext commonInfo; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; protected List alleles = null; // new ArrayList(); + protected Type type = null; protected boolean isPhased = false; - private boolean filtersWereAppliedToContext; + protected boolean filtersWereAppliedToContext; public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { - this.alleles = Collections.unmodifiableList(alleles); + if ( alleles != null ) + this.alleles = Collections.unmodifiableList(alleles); commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes); filtersWereAppliedToContext = filters != null; this.isPhased = isPhased; @@ -66,6 +69,9 @@ public class Genotype { } public List getAlleles(Allele allele) { + if ( getType() == Type.UNAVAILABLE ) + throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); + List al = new ArrayList(); for ( Allele a : alleles ) if ( a.equals(allele) ) @@ -75,6 +81,8 @@ public class Genotype { } public Allele getAllele(int i) { + if ( getType() == Type.UNAVAILABLE ) + throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); return alleles.get(i); } @@ -89,10 +97,21 @@ public class Genotype { NO_CALL, HOM_REF, HET, - HOM_VAR + HOM_VAR, + UNAVAILABLE } public Type getType() { + if ( type == null ) { + type = determineType(); + } + return type; + } + + protected Type determineType() { + if ( alleles == null ) + return Type.UNAVAILABLE; + Allele firstAllele = alleles.get(0); if ( firstAllele.isNoCall() ) { @@ -122,7 +141,8 @@ public class Genotype { * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF) */ public boolean isNoCall() { return getType() == Type.NO_CALL; } - public boolean isCalled() { return getType() != Type.NO_CALL; } + public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; } + public boolean isAvailable() { return getType() != Type.UNAVAILABLE; } // // Useful methods for getting genotype likelihoods for a genotype object, if present @@ -157,8 +177,8 @@ public class Genotype { } public void validate() { - if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles"); - if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles"); + if ( alleles == null ) return; + if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0"); int nNoCalls = 0; for ( Allele allele : alleles ) { @@ -175,6 +195,9 @@ public class Genotype { } public String getGenotypeString(boolean ignoreRefState) { + if ( alleles == null ) + return null; + // Notes: // 1. Make sure to use the appropriate separator depending on whether the genotype is phased // 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index da80a3431..92c5d648b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1206,9 +1206,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName()); - for ( Allele gAllele : g.getAlleles() ) { - if ( ! hasAllele(gAllele) && gAllele.isCalled() ) - throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); + if ( g.isAvailable() ) { + for ( Allele gAllele : g.getAlleles() ) { + if ( ! hasAllele(gAllele) && gAllele.isCalled() ) + throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); + } } } } From 797c50e6894f5e5fd341c3b002610301173c269a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 13 Jul 2011 10:01:23 -0400 Subject: [PATCH 13/35] Fixing integration tests I broke yesterday; removing batch merging test since we don't support that anymore. --- .../phasing/MergeMNPsIntegrationTest.java | 6 +-- ...gatingAlternateAllelesIntegrationTest.java | 6 +-- .../BatchMergeIntegrationTest.java | 46 ------------------- 3 files changed, 6 insertions(+), 52 deletions(-) delete mode 100755 public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java index 3f87fc1a2..c88eac149 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java @@ -23,7 +23,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1) + " -L chr20:556259-756570", 1, - Arrays.asList("e312b7d3854d5b2834a370659514a813")); + Arrays.asList("7f11f7f75d1526077f0173c7ed1fc6c4")); executeTest("Merge MNP sites within genomic distance of 1 [TEST ONE]", spec); } @@ -33,7 +33,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10) + " -L chr20:556259-756570", 1, - Arrays.asList("681f50e45f1d697370d2c355df2e18bc")); + Arrays.asList("53dd312468296826bdd3c22387390c88")); executeTest("Merge MNP sites within genomic distance of 10 [TEST TWO]", spec); } @@ -43,7 +43,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100) + " -L chr20:556259-756570", 1, - Arrays.asList("0bccb0ef928a108418246bec01098083")); + Arrays.asList("e26f92d2fb9f4eaeac7f9d8ee27410ee")); executeTest("Merge MNP sites within genomic distance of 100 [TEST THREE]", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java index 009048c10..f855c1dd3 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java @@ -23,7 +23,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1) + " -L chr20:556259-756570", 1, - Arrays.asList("e16f957d888054ae0518e25660295241")); + Arrays.asList("af5e1370822551c0c6f50f23447dc627")); executeTest("Merge sites within genomic distance of 1 [TEST ONE]", spec); } @@ -33,7 +33,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10) + " -L chr20:556259-756570", 1, - Arrays.asList("122a482090677c7619c2105d44e00d11")); + Arrays.asList("dd8c44ae1ef059a7fe85399467e102eb")); executeTest("Merge sites within genomic distance of 10 [TEST TWO]", spec); } @@ -43,7 +43,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100) + " -L chr20:556259-756570", 1, - Arrays.asList("bc6a8c8a42bb2601db98e88e9ad74748")); + Arrays.asList("f81fd72ecaa57b3215406fcea860bcc5")); executeTest("Merge sites within genomic distance of 100 [TEST THREE]", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java deleted file mode 100755 index 7e1d86105..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; - -public class BatchMergeIntegrationTest extends WalkerTest { - @Test - public void testBatchMerge1() { - String bam = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; - String alleles = validationDataLocation + "batch.merge.alleles.vcf"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T UnifiedGenotyper -NO_HEADER -BTI alleles -stand_call_conf 0.0 -glm BOTH -G none -nsl -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -o %s -R " + b37KGReference - + " -B:alleles,VCF " + alleles - + " -I " + bam, - 1, - Arrays.asList("f4ed8f4ef2cba96823c06e90e9d0de35")); - executeTest("testBatchMerge UG genotype given alleles:" + new File(bam).getName() + " with " + new File(alleles).getName(), spec); - } -} \ No newline at end of file From fa3ff5350800f1449b6e4e5f365cf2b745906932 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Wed, 13 Jul 2011 11:58:16 -0400 Subject: [PATCH 17/35] Filters should only be applied to the new VC if the old VC had filters applied --- .../sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index e59b29502..af24035c8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -220,6 +220,9 @@ public class ReadBackedPhasingWalker extends RodWalker Date: Wed, 13 Jul 2011 14:40:01 -0400 Subject: [PATCH 19/35] Don't output source and ref header lines anymore. Short-term motivation for this is that I'd like this tool when run on a VCF to emit the exact same VCF. Long-term motivation is that these tags should be output by the VCF writer itself for all tools. --- .../walkers/variantutils/VariantsToVCF.java | 4 +-- .../utils/codecs/vcf/VCFIntegrationTest.java | 28 +++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 7eb49da34..79134b553 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -199,8 +199,8 @@ public class VariantsToVCF extends RodWalker { // setup the header fields Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("source", "VariantsToVCF")); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + //hInfo.add(new VCFHeaderLine("source", "VariantsToVCF")); + //hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); allowedGenotypeFormatStrings.add(VCFConstants.GENOTYPE_KEY); for ( VCFHeaderLine field : hInfo ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java new file mode 100644 index 000000000..32ff25c7b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.utils.codecs.vcf; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class VCFIntegrationTest extends WalkerTest { + + @Test + public void testReadingAndWritingWitHNoChanges() { + + String md5ofInputVCF = "a990ba187a69ca44cb9bc2bb44d00447"; + String testVCF = validationDataLocation + "vcf4.1.example.vcf"; + + String baseCommand = "-R " + b37KGReference + " -NO_HEADER -o %s "; + + String test1 = baseCommand + "-T VariantAnnotator -BTI variant -B:variant,vcf " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList(md5ofInputVCF)); + List result = executeTest("Test Variant Annotator with no changes", spec1).getFirst(); + + String test2 = baseCommand + "-T VariantsToVCF -B:variant,vcf " + result.get(0).getAbsolutePath(); + WalkerTestSpec spec2 = new WalkerTestSpec(test2, 1, Arrays.asList(md5ofInputVCF)); + executeTest("Test Variants To VCF from new output", spec2); + } +} From df996a1a738208dac7c42b24645f747304b3de7f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 13 Jul 2011 14:53:58 -0400 Subject: [PATCH 20/35] more progress report for the Data Processing Pipeline. Bam lists can now have empty lines, comments and whitespaces anywhere. --- .../qscripts/DataProcessingPipeline.scala | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index f9369ee3f..d6caabd23 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -147,13 +147,22 @@ class DataProcessingPipeline extends QScript { } } + println("\n\n*** DEBUG ***\n") // Creating one file for each sample in the dataset val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] for ((sample, flist) <- sampleTable) { + + println(sample + ":") + for (f <- flist) + println (f) + println() + val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".bam") sampleBamFiles(sample) = sampleFileName add(joinBams(flist, sampleFileName)) } + println("*** DEBUG ***\n\n") + return sampleBamFiles.toMap } @@ -211,8 +220,10 @@ class DataProcessingPipeline extends QScript { if (in.toString.endsWith("bam")) return List(in) var l: List[File] = List() - for (bam <- fromFile(in).getLines) - l :+= new File(bam) + for (bam <- fromFile(in).getLines) { + if (!bam.startsWith("#") && !bam.isEmpty) + l :+= new File(bam.trim) + } return l } @@ -234,9 +245,6 @@ class DataProcessingPipeline extends QScript { // Generate a BAM file per sample joining all per lane files if necessary val sampleBamFiles: Map[String, File] = createSampleFiles(bams, realignedBams) - - println("nContigs: " + nContigs) - // Final output list of processed bam files var cohortList: List[File] = List() @@ -244,6 +252,7 @@ class DataProcessingPipeline extends QScript { println("\nFound the following samples: ") for ((sample, file) <- sampleBamFiles) println("\t" + sample + " -> " + file) + println("\n") // If this is a 'knowns only' indel realignment run, do it only once for all samples. val globalIntervals = new File(outputDir + projectName + ".intervals") From bb0e3a26fcf2a0c99ee0e70c46fb6359b3720a40 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 13 Jul 2011 14:57:21 -0400 Subject: [PATCH 21/35] Added integration test for VCF writing. Also, bug fix for writing the GT-free records. --- .../sting/utils/codecs/vcf/StandardVCFWriter.java | 7 ++++--- .../variantutils/VariantsToVCFIntegrationTest.java | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index f7d09f16d..e7ddac185 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -444,9 +444,10 @@ public class StandardVCFWriter implements VCFWriter { break; } - for (String s : attrs ) { - mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR); - mWriter.write(s); + for (int i = 0; i < attrs.size(); i++) { + if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY) ) + mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR); + mWriter.write(attrs.get(i)); } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java index 8421076c9..8c96c1e11 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java @@ -20,7 +20,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testVariantsToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("815b82fff92aab41c209eedce2d7e7d9"); + md5.add("4accae035d271b35ee2ec58f403c68c6"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + @@ -38,7 +38,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("22336ee9c12aa222ce29c3c5babca7d0"); + md5.add("71e8c98d7c3a73b6287ecc339086fe03"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + @@ -56,7 +56,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingHapMapInput() { List md5 = new ArrayList(); - md5.add("9bedaa7670b86a07be5191898c3727cf"); + md5.add("f343085305e80c7a2493422e4eaad983"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + @@ -73,7 +73,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingVCFInput() { List md5 = new ArrayList(); - md5.add("cc215edec9ca28e5c79ab1b67506f9f7"); + md5.add("86f02e2e764ba35854cff2aa05a1fdd8"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + From 66c652d687ec5b5e15b33255441e31b775d20c3c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 14 Jul 2011 11:56:10 -0400 Subject: [PATCH 28/35] Added some extra error checks in the VCF codec. Now that we've moved this back into the GATK, changed some of the standard exceptions to be USerErrors (instead of TribbleExceptions). --- .../utils/codecs/vcf/AbstractVCFCodec.java | 25 ++++++++++--------- .../sting/utils/exceptions/UserException.java | 10 ++++++++ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 01344a117..710127f7a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -7,6 +7,8 @@ import org.broad.tribble.NameAwareCodec; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -96,6 +98,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, for ( String str : headerStrings ) { if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); + if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) + throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); + int arrayIndex = 0; for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { try { @@ -159,12 +164,11 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, } private Feature reallyDecode(String line) { - try { // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new IllegalStateException("VCF Header cannot be null when decoding a record"); + if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); if (parts == null) parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)]; @@ -174,17 +178,18 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data) if (( (header == null || (header != null && !header.hasGenotypingData())) && nParts != NUM_STANDARD_FIELDS) || (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) - throw new IllegalArgumentException("There aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + - " tokens, and saw " + nParts + " )"); + throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + + " tokens, and saw " + nParts + " )", lineNo); return parseVCFLine(parts); - } catch (TribbleException e) { - throw new TribbleException.InvalidDecodeLine(e.getMessage(), line); - } } protected void generateException(String message) { - throw new TribbleException.InvalidDecodeLine(message, lineNo); + throw new UserException.MalformedVCF(message, lineNo); + } + + private static void generateException(String message, int lineNo) { + throw new UserException.MalformedVCF(message, lineNo); } /** @@ -472,10 +477,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, return true; } - private static void generateException(String message, int lineNo) { - throw new TribbleException.InvalidDecodeLine(message, lineNo); - } - private static int computeForwardClipping(List unclippedAlleles, String ref) { boolean clipping = true; // Note that the computation of forward clipping here is meant only to see whether there is a common diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 0be4bec91..17c4a7df4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -154,6 +154,16 @@ public class UserException extends ReviewedStingException { } } + public static class MalformedVCF extends UserException { + public MalformedVCF(String message, String line) { + super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + } + + public MalformedVCF(String message, int lineNo) { + super(String.format("The provided VCF file is malformed at line nmber %d: %s", lineNo, message)); + } + } + public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(SAMRecord read) { super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName())); From ed6beae1f3769c247b9a4fc80121985baad6443f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 14 Jul 2011 13:55:35 -0400 Subject: [PATCH 29/35] Adding headers to diffable reading for VCFs --- .../gatk/walkers/diffengine/VCFDiffableReader.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index 06d14366f..5677574bd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -26,16 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.diffengine; import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.*; -import java.util.Arrays; import java.util.Map; -import java.util.zip.GZIPInputStream; /** @@ -58,7 +54,11 @@ public class VCFDiffableReader implements DiffableReader { VCFCodec vcfCodec = new VCFCodec(); // must be read as state is stored in reader itself - vcfCodec.readHeader(lineReader); + VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader); + for ( VCFHeaderLine headerLine : header.getMetaData() ) { + final String key = (headerLine instanceof VCFNamedHeaderLine ? headerLine.getKey() + "." + ((VCFNamedHeaderLine) headerLine).getName() : headerLine.getKey()); + root.add(key, headerLine.toString()); + } String line = lineReader.readLine(); int count = 0; From 9540df69986c16112b5fdd57efebf72846d86424 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 14 Jul 2011 14:00:19 -0400 Subject: [PATCH 30/35] Oops, forgot to update unit test --- .../sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java index baa2f0383..a0cb47770 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java @@ -87,7 +87,7 @@ public class DiffableReaderUnitTest extends BaseTest { Assert.assertSame(diff.getParent(), DiffElement.ROOT); DiffNode node = diff.getValueAsNode(); - Assert.assertEquals(node.getElements().size(), 9); + Assert.assertEquals(node.getElements().size(), 10); // chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode(); From 5ffeddd3b1ac5f83d6018dad0a178ae785a4dc39 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 14 Jul 2011 14:45:16 -0400 Subject: [PATCH 31/35] better to use _ instead of ., as this is a special case later. --- .../sting/gatk/walkers/diffengine/VCFDiffableReader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index 5677574bd..a812babaf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -56,7 +56,9 @@ public class VCFDiffableReader implements DiffableReader { // must be read as state is stored in reader itself VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader); for ( VCFHeaderLine headerLine : header.getMetaData() ) { - final String key = (headerLine instanceof VCFNamedHeaderLine ? headerLine.getKey() + "." + ((VCFNamedHeaderLine) headerLine).getName() : headerLine.getKey()); + String key = headerLine.getKey(); + if ( headerLine instanceof VCFNamedHeaderLine ) + key += "_" + ((VCFNamedHeaderLine) headerLine).getName(); root.add(key, headerLine.toString()); } From c0bbeb23ba0200d80f940a395ddc019fdb61f093 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 14 Jul 2011 15:12:28 -0400 Subject: [PATCH 32/35] Now providing more information when the index on the fly isn't equal to the one created by reading the file from disk. --- .../test/org/broadinstitute/sting/WalkerTest.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index dacaf2738..d65f4ec34 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting; import org.apache.commons.lang.StringUtils; +import org.broad.tribble.FeatureCodec; import org.broad.tribble.Tribble; +import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.gatk.CommandLineExecutable; @@ -64,10 +66,19 @@ public class WalkerTest extends BaseTest { } System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile); - Assert.assertTrue(IndexFactory.onDiskIndexEqualToNewlyCreatedIndex(resultFile, indexFile, new VCFCodec()), "Index on disk from indexing on the fly not equal to the index created after the run completed"); + Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec()); + Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); + + if ( ! indexFromOutputFile.equals(dynamicIndex) ) { + Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n", + indexFromOutputFile.getProperties(), + dynamicIndex.getProperties())); + } } } + + public List assertMatchingMD5s(final String name, List resultFiles, List expectedMD5s) { List md5s = new ArrayList(); for (int i = 0; i < resultFiles.size(); i++) { From 9f5180ab053d7116b194f2ef09b5f157dbeb4cca Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 14 Jul 2011 16:42:17 -0400 Subject: [PATCH 33/35] Recalibrates a list of bam files allowing multiple bams to be recalibrated out of a single 'mother' queue job. --- .../qscripts/RecalibrateBaseQualities.scala | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index dc9ae0f4b..88c7f5ff7 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -3,6 +3,7 @@ package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ import net.sf.samtools.SAMFileReader +import io.Source._ /** * Created by IntelliJ IDEA. @@ -37,21 +38,35 @@ class RecalibrateBaseQualities extends QScript { return samReader.getFileHeader.getSequenceDictionary.getSequences.size() } + // Reads a BAM LIST file and creates a scala list with all the files + def createListFromFile(in: File):List[File] = { + if (in.toString.endsWith("bam")) + return List(in) + var l: List[File] = List() + for (bam <- fromFile(in).getLines) + l :+= new File(bam) + return l + } + def script = { - nContigs = getNumberOfContigs(input) + val bamList = createListFromFile(input) + nContigs = getNumberOfContigs(bamList(0)) - val recalFile1: File = swapExt(input, ".bam", ".recal1.csv") - val recalFile2: File = swapExt(input, ".bam", ".recal2.csv") - val recalBam: File = swapExt(input, ".bam", ".recal.bam") - val path1: String = input + "before" - val path2: String = input + "after" - - add(cov(input, recalFile1), - recal(input, recalFile1, recalBam), - cov(recalBam, recalFile2), - analyzeCovariates(recalFile1, path1), - analyzeCovariates(recalFile2, path2)) + for (bam <- bamList) { + + val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv") + val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv") + val recalBam: File = swapExt(bam, ".bam", ".recal.bam") + val path1: String = bam + "before" + val path2: String = bam + "after" + + add(cov(bam, recalFile1), + recal(bam, recalFile1, recalBam), + cov(recalBam, recalFile2), + analyzeCovariates(recalFile1, path1), + analyzeCovariates(recalFile2, path2)) + } } trait CommandLineGATKArgs extends CommandLineGATK { From 09ffe277ae7d3418f75510cabe65e6c9a8fa8f25 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 14 Jul 2011 17:09:35 -0400 Subject: [PATCH 34/35] Added a qscripts util package with some utility functions commonly shared across queue scripts. Refactored some of my public scripts to use it in an effort to make queue scripts more reusable and "supportable". --- .../qscripts/DataProcessingPipeline.scala | 38 ++---------- .../qscripts/RecalibrateBaseQualities.scala | 20 +----- .../sting/queue/qscripts/utils/Utils.scala | 62 +++++++++++++++++++ 3 files changed, 71 insertions(+), 49 deletions(-) create mode 100644 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index f9369ee3f..e62b1b926 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -4,13 +4,13 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.function.ListWriterFunction -import scala.io.Source._ import collection.JavaConversions._ import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel import org.broadinstitute.sting.queue.extensions.picard._ -import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord} +import net.sf.samtools.{SAMFileReader} import net.sf.samtools.SAMFileHeader.SortOrder +import org.broadinstitute.sting.queue.qscripts.utils.Utils class DataProcessingPipeline extends QScript { qscript => @@ -103,18 +103,6 @@ class DataProcessingPipeline extends QScript { val ds: String) {} - // Utility function to check if there are multiple samples in a BAM file (currently we can't deal with that) - def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { - var sample: String = "" - for (r <- readGroups) { - if (sample.isEmpty) - sample = r.getSample - else if (sample != r.getSample) - return true; - } - return false - } - // Utility function to merge all bam files of similar samples. Generates one BAM file per sample. // It uses the sample information on the header of the input BAM files. // @@ -135,7 +123,7 @@ class DataProcessingPipeline extends QScript { // only allow one sample per file. Bam files with multiple samples would require pre-processing of the file // with PrintReads to separate the samples. Tell user to do it himself! - assert(!hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam) + assert(!Utils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam) // Fill out the sample table with the readgroups in this file for (rg <- readGroups) { @@ -157,12 +145,6 @@ class DataProcessingPipeline extends QScript { return sampleBamFiles.toMap } - // Checks how many contigs are in the dataset. Uses the BAM file header information. - def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(new File(bamFile)) - return samReader.getFileHeader.getSequenceDictionary.getSequences.size() - } - // Rebuilds the Read Group string to give BWA def addReadGroups(inBam: File, outBam: File, samReader: SAMFileReader) { val readGroups = samReader.getFileHeader.getReadGroups @@ -206,15 +188,7 @@ class DataProcessingPipeline extends QScript { return realignedBams } - // Reads a BAM LIST file and creates a scala list with all the files - def createListFromFile(in: File):List[File] = { - if (in.toString.endsWith("bam")) - return List(in) - var l: List[File] = List() - for (bam <- fromFile(in).getLines) - l :+= new File(bam) - return l - } + @@ -226,8 +200,8 @@ class DataProcessingPipeline extends QScript { def script = { // keep a record of the number of contigs in the first bam file in the list - val bams = createListFromFile(input) - nContigs = getNumberOfContigs(bams(0)) + val bams = Utils.createListFromFile(input) + nContigs = Utils.getNumberOfContigs(bams(0)) val realignedBams = if (useBWApe || useBWAse) {performAlignment(bams)} else {bams} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index 88c7f5ff7..b2960f150 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -4,6 +4,7 @@ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ import net.sf.samtools.SAMFileReader import io.Source._ +import org.broadinstitute.sting.queue.qscripts.utils.Utils /** * Created by IntelliJ IDEA. @@ -33,25 +34,10 @@ class RecalibrateBaseQualities extends QScript { val queueLogDir: String = ".qlog/" var nContigs: Int = 0 - def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(new File(bamFile)) - return samReader.getFileHeader.getSequenceDictionary.getSequences.size() - } - - // Reads a BAM LIST file and creates a scala list with all the files - def createListFromFile(in: File):List[File] = { - if (in.toString.endsWith("bam")) - return List(in) - var l: List[File] = List() - for (bam <- fromFile(in).getLines) - l :+= new File(bam) - return l - } - def script = { - val bamList = createListFromFile(input) - nContigs = getNumberOfContigs(bamList(0)) + val bamList = Utils.createListFromFile(input) + nContigs = Utils.getNumberOfContigs(bamList(0)) for (bam <- bamList) { diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala new file mode 100644 index 000000000..ff94744ee --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.queue.qscripts.utils + +import java.io.File +import io.Source._ +import net.sf.samtools.{SAMReadGroupRecord, SAMFileReader} + +import collection.JavaConversions._ + + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 7/14/11 + * Time: 4:57 PM + * To change this template use File | Settings | File Templates. + */ + +object Utils { + + /** + * Takes a bam list file and produces a scala list with each file allowing the bam list + * to have empty lines and comment lines (lines starting with #). + */ + def createListFromFile(in: File):List[File] = { + // If the file provided ends with .bam, it is not a bam list, we treat it as a single file. + // and return a list with only this file. + if (in.toString.endsWith(".bam")) + return List(in) + + var list: List[File] = List() + for (bam <- fromFile(in).getLines) + if (!bam.startsWith("#") && !bam.isEmpty ) + list :+= new File(bam.trim()) + list + } + + /** + * Returns the number of contigs in the BAM file header. + */ + + def getNumberOfContigs(bamFile: File): Int = { + val samReader = new SAMFileReader(new File(bamFile)) + samReader.getFileHeader.getSequenceDictionary.getSequences.size() + } + + /** + * Check if there are multiple samples in a BAM file + */ + + def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { + var sample: String = "" + for (r <- readGroups) { + if (sample.isEmpty) + sample = r.getSample + else if (sample != r.getSample) + return true; + } + false + } + + +} \ No newline at end of file From 43c6a8565bfe77e4c50317a3e8858b754d4afcba Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 14 Jul 2011 17:10:44 -0400 Subject: [PATCH 35/35] looks better now. --- .../org/broadinstitute/sting/queue/qscripts/utils/Utils.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala index ff94744ee..1189b376c 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala @@ -37,7 +37,6 @@ object Utils { /** * Returns the number of contigs in the BAM file header. */ - def getNumberOfContigs(bamFile: File): Int = { val samReader = new SAMFileReader(new File(bamFile)) samReader.getFileHeader.getSequenceDictionary.getSequences.size() @@ -46,7 +45,6 @@ object Utils { /** * Check if there are multiple samples in a BAM file */ - def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { var sample: String = "" for (r <- readGroups) {