From 8014178f2f306c5ea04acd868ed49e90997666c6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 10 Jun 2012 20:13:18 -0400 Subject: [PATCH] Algorithmically faster version of DiffEngine -- Now only includes leaf nodes in the summary, i.e., summaries of the form "*.*....*.X", which are really the most valuable to see. This calculation can be accomplished in linear time for N differences, rather than the previous O(n^2) algorithm -- Now computes the max number of elements to read correctly. Counts now the size of the entire element tree, not just the count of the roots, which was painful because the trees vary by orders of magnitude in size. -- Because of this we can enforce a meaningful, useful value for the max elements in MD5 or 100K, and this works well. -- Added integration test for new leaf and old pairwise calculations -- Bugfix for Utils.join(sep, int[]) that was eating the first element of the AD, PL fields --- .../walkers/diffengine/BAMDiffableReader.java | 5 +- .../gatk/walkers/diffengine/DiffEngine.java | 51 +++++++++++++++++-- .../walkers/diffengine/DiffObjectsWalker.java | 6 ++- .../walkers/diffengine/VCFDiffableReader.java | 6 +-- .../org/broadinstitute/sting/utils/Utils.java | 3 +- .../test/org/broadinstitute/sting/MD5DB.java | 6 +-- .../DiffObjectsIntegrationTest.java | 11 ++-- 7 files changed, 71 insertions(+), 17 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java index a1c043365..2d372ca9f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java @@ -55,8 +55,6 @@ public class BAMDiffableReader implements DiffableReader { int count = 0; while ( iterator.hasNext() ) { - if ( count++ > maxElementsToRead && maxElementsToRead != -1) - break; final SAMRecord record = iterator.next(); // name is the read name + first of pair @@ -88,6 +86,9 @@ public class BAMDiffableReader implements DiffableReader { if ( ! root.hasElement(name) ) // protect ourselves from malformed files root.add(readRoot); + count += readRoot.size(); + if ( count > maxElementsToRead && maxElementsToRead != -1) + break; } reader.close(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java index f66eeed4e..40ed26608 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -147,7 +147,7 @@ public class DiffEngine { * @param diffs the list of differences to summarize */ public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { - printSummaryReport(summarizedDifferencesOfPaths(diffs, params.maxRawDiffsToSummarize), params ); + printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params ); } final protected static String[] diffNameToPath(String diffName) { @@ -161,9 +161,17 @@ public class DiffEngine { diffs.add(new Difference(diff)); } - return summarizedDifferencesOfPaths(diffs, -1); + return summarizedDifferencesOfPaths(diffs, true, -1); } + /** + * Computes a minimum set of potential differences between all singleton differences + * in singletonDiffs. Employs an expensive pairwise O(n^2) algorithm. + * + * @param singletonDiffs + * @param maxRawDiffsToSummarize + * @return + */ private Map initialPairwiseSummaries(final List singletonDiffs, final int maxRawDiffsToSummarize) { Map summaries = new HashMap(); @@ -191,9 +199,41 @@ public class DiffEngine { return summaries; } + /** + * Computes the possible leaf differences among the singleton diffs. + * + * The leaf differences are all of the form *.*...*.X where all internal + * differences are wildcards and the only summarized difference considered + * interesting to compute is + * + * @param singletonDiffs + * @param maxRawDiffsToSummarize + * @return + */ + private Map initialLeafSummaries(final List singletonDiffs, + final int maxRawDiffsToSummarize) { + Map summaries = new HashMap(); + + // create the initial set of differences + for ( final Difference d : singletonDiffs ) { + final String path = summarizedPath(d.getParts(), 1); + Difference sumDiff = new Difference(path, d.getMaster(), d.getTest()); + sumDiff.setCount(0); + addSummaryIfMissing(summaries, sumDiff); + + if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize) + return summaries; + } + + return summaries; + } + protected List summarizedDifferencesOfPaths(final List singletonDiffs, + final boolean doPairwise, final int maxRawDiffsToSummarize) { - Map summaries = initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize); + final Map summaries = doPairwise + ? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize) + : initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize); // count differences for ( Difference diffPath : singletonDiffs ) { @@ -372,18 +412,21 @@ public class DiffEngine { final int maxCountOneItems; final int minSumDiffToShow; final int maxRawDiffsToSummarize; + final boolean doPairwise; boolean descending = true; public SummaryReportParams(PrintStream out, int maxItemsToDisplay, int maxCountOneItems, int minSumDiffToShow, - int maxRawDiffsToSummarize) { + int maxRawDiffsToSummarize, + final boolean doPairwise) { this.out = out; this.maxItemsToDisplay = maxItemsToDisplay; this.maxCountOneItems = maxCountOneItems; this.minSumDiffToShow = minSumDiffToShow; this.maxRawDiffsToSummarize = maxRawDiffsToSummarize; + this.doPairwise = doPairwise; } public void setDescending(boolean descending) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java index 1524a6a7b..b07e2a610 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java @@ -165,6 +165,8 @@ public class DiffObjectsWalker extends RodWalker { @Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) int maxRawDiffsToSummary = -1; + @Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false) + boolean doPairwise = false; /** * The max number of differences to display when summarizing. For example, if there are 10M differences, but @@ -243,7 +245,9 @@ public class DiffObjectsWalker extends RodWalker { out.printf("DIFF: %s%n", diff.toString()); } - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, maxRawDiffsToSummary); + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, + MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, + maxRawDiffsToSummary, doPairwise); params.setDescending(false); diffEngine.reportSummarizedDifferences(diffs, params); logger.info(String.format("Done summarizing differences")); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index 3794d0967..f06bca904 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -81,9 +81,6 @@ public class VCFDiffableReader implements DiffableReader { String prevName = ""; Iterator it = reader.iterator(); while ( it.hasNext() ) { - if ( count++ > maxElementsToRead && maxElementsToRead != -1) - break; - VariantContext vc = it.next(); String name = vc.getChr() + ":" + vc.getStart(); if ( name.equals(prevName) ) { @@ -125,6 +122,9 @@ public class VCFDiffableReader implements DiffableReader { } root.add(vcRoot); + count += vcRoot.size(); + if ( count > maxElementsToRead && maxElementsToRead != -1) + break; } reader.close(); diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b6b4c8ca6..17c145dbf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -227,7 +227,8 @@ public class Utils { if ( ints == null || ints.length == 0) return ""; else { - StringBuilder ret = new StringBuilder(ints[0]); + StringBuilder ret = new StringBuilder(); + ret.append(ints[0]); for (int i = 1; i < ints.length; ++i) { ret.append(separator); ret.append(ints[i]); diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index 6e7cbc230..85780b569 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -48,8 +48,8 @@ public class MD5DB { /** * Subdirectory under the ant build directory where we store integration test md5 results */ - private static final int MAX_RECORDS_TO_READ = 1000; - private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 100; + private static final int MAX_RECORDS_TO_READ = 100000; + private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = -1; public static final String LOCAL_MD5_DB_DIR = "integrationtests"; public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests"; @@ -284,7 +284,7 @@ public class MD5DB { // inline differences final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final PrintStream ps = new PrintStream(baos); - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE); + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false); boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params); if ( success ) { final String content = baos.toString(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java index 66fec35e6..fd03a9cc1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java @@ -35,12 +35,14 @@ public class DiffObjectsIntegrationTest extends WalkerTest { private class TestParams extends TestDataProvider { public File master, test; public String MD5; + public boolean doPairwise; - private TestParams(String master, String test, String MD5) { + private TestParams(String master, String test, final boolean doPairwise, String MD5) { super(TestParams.class); this.master = new File(master); this.test = new File(test); this.MD5 = MD5; + this.doPairwise = doPairwise; } public String toString() { @@ -50,8 +52,10 @@ public class DiffObjectsIntegrationTest extends WalkerTest { @DataProvider(name = "data") public Object[][] createData() { - new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "daead9bfab1a5df72c5e3a239366118e"); - new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "3f46f5a964f7c34015d972256fe49a35"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", true, "fc06e758e5588a52d2dddafdff1665a4"); + new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", true, "3f46f5a964f7c34015d972256fe49a35"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", false, "54dff80e3f9569146dd66d5369c82b48"); + new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", false, "47bf16c27c9e2c657a7e1d13f20880c9"); return TestParams.getTests(TestParams.class); } @@ -61,6 +65,7 @@ public class DiffObjectsIntegrationTest extends WalkerTest { "-T DiffObjects -R public/testdata/exampleFASTA.fasta " + " -m " + params.master + " -t " + params.test + + (params.doPairwise ? " -doPairwise " : "") + " -o %s", Arrays.asList(params.MD5)); executeTest("testDiffObjects:"+params, spec).getFirst();