diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java index a1c043365..2d372ca9f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java @@ -55,8 +55,6 @@ public class BAMDiffableReader implements DiffableReader { int count = 0; while ( iterator.hasNext() ) { - if ( count++ > maxElementsToRead && maxElementsToRead != -1) - break; final SAMRecord record = iterator.next(); // name is the read name + first of pair @@ -88,6 +86,9 @@ public class BAMDiffableReader implements DiffableReader { if ( ! root.hasElement(name) ) // protect ourselves from malformed files root.add(readRoot); + count += readRoot.size(); + if ( count > maxElementsToRead && maxElementsToRead != -1) + break; } reader.close(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java index f66eeed4e..40ed26608 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -147,7 +147,7 @@ public class DiffEngine { * @param diffs the list of differences to summarize */ public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { - printSummaryReport(summarizedDifferencesOfPaths(diffs, params.maxRawDiffsToSummarize), params ); + printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params ); } final protected static String[] diffNameToPath(String diffName) { @@ -161,9 +161,17 @@ public class DiffEngine { diffs.add(new Difference(diff)); } - return summarizedDifferencesOfPaths(diffs, -1); + return summarizedDifferencesOfPaths(diffs, true, -1); } + /** + * Computes a minimum set of potential differences between all singleton differences + * in singletonDiffs. Employs an expensive pairwise O(n^2) algorithm. + * + * @param singletonDiffs + * @param maxRawDiffsToSummarize + * @return + */ private Map initialPairwiseSummaries(final List singletonDiffs, final int maxRawDiffsToSummarize) { Map summaries = new HashMap(); @@ -191,9 +199,41 @@ public class DiffEngine { return summaries; } + /** + * Computes the possible leaf differences among the singleton diffs. + * + * The leaf differences are all of the form *.*...*.X where all internal + * differences are wildcards and the only summarized difference considered + * interesting to compute is + * + * @param singletonDiffs + * @param maxRawDiffsToSummarize + * @return + */ + private Map initialLeafSummaries(final List singletonDiffs, + final int maxRawDiffsToSummarize) { + Map summaries = new HashMap(); + + // create the initial set of differences + for ( final Difference d : singletonDiffs ) { + final String path = summarizedPath(d.getParts(), 1); + Difference sumDiff = new Difference(path, d.getMaster(), d.getTest()); + sumDiff.setCount(0); + addSummaryIfMissing(summaries, sumDiff); + + if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize) + return summaries; + } + + return summaries; + } + protected List summarizedDifferencesOfPaths(final List singletonDiffs, + final boolean doPairwise, final int maxRawDiffsToSummarize) { - Map summaries = initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize); + final Map summaries = doPairwise + ? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize) + : initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize); // count differences for ( Difference diffPath : singletonDiffs ) { @@ -372,18 +412,21 @@ public class DiffEngine { final int maxCountOneItems; final int minSumDiffToShow; final int maxRawDiffsToSummarize; + final boolean doPairwise; boolean descending = true; public SummaryReportParams(PrintStream out, int maxItemsToDisplay, int maxCountOneItems, int minSumDiffToShow, - int maxRawDiffsToSummarize) { + int maxRawDiffsToSummarize, + final boolean doPairwise) { this.out = out; this.maxItemsToDisplay = maxItemsToDisplay; this.maxCountOneItems = maxCountOneItems; this.minSumDiffToShow = minSumDiffToShow; this.maxRawDiffsToSummarize = maxRawDiffsToSummarize; + this.doPairwise = doPairwise; } public void setDescending(boolean descending) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java index 1524a6a7b..b07e2a610 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java @@ -165,6 +165,8 @@ public class DiffObjectsWalker extends RodWalker { @Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) int maxRawDiffsToSummary = -1; + @Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false) + boolean doPairwise = false; /** * The max number of differences to display when summarizing. For example, if there are 10M differences, but @@ -243,7 +245,9 @@ public class DiffObjectsWalker extends RodWalker { out.printf("DIFF: %s%n", diff.toString()); } - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, maxRawDiffsToSummary); + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, + MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, + maxRawDiffsToSummary, doPairwise); params.setDescending(false); diffEngine.reportSummarizedDifferences(diffs, params); logger.info(String.format("Done summarizing differences")); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index 3794d0967..f06bca904 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -81,9 +81,6 @@ public class VCFDiffableReader implements DiffableReader { String prevName = ""; Iterator it = reader.iterator(); while ( it.hasNext() ) { - if ( count++ > maxElementsToRead && maxElementsToRead != -1) - break; - VariantContext vc = it.next(); String name = vc.getChr() + ":" + vc.getStart(); if ( name.equals(prevName) ) { @@ -125,6 +122,9 @@ public class VCFDiffableReader implements DiffableReader { } root.add(vcRoot); + count += vcRoot.size(); + if ( count > maxElementsToRead && maxElementsToRead != -1) + break; } reader.close(); diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b6b4c8ca6..17c145dbf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -227,7 +227,8 @@ public class Utils { if ( ints == null || ints.length == 0) return ""; else { - StringBuilder ret = new StringBuilder(ints[0]); + StringBuilder ret = new StringBuilder(); + ret.append(ints[0]); for (int i = 1; i < ints.length; ++i) { ret.append(separator); ret.append(ints[i]); diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index 6e7cbc230..85780b569 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -48,8 +48,8 @@ public class MD5DB { /** * Subdirectory under the ant build directory where we store integration test md5 results */ - private static final int MAX_RECORDS_TO_READ = 1000; - private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 100; + private static final int MAX_RECORDS_TO_READ = 100000; + private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = -1; public static final String LOCAL_MD5_DB_DIR = "integrationtests"; public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests"; @@ -284,7 +284,7 @@ public class MD5DB { // inline differences final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final PrintStream ps = new PrintStream(baos); - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE); + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false); boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params); if ( success ) { final String content = baos.toString(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java index 66fec35e6..fd03a9cc1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java @@ -35,12 +35,14 @@ public class DiffObjectsIntegrationTest extends WalkerTest { private class TestParams extends TestDataProvider { public File master, test; public String MD5; + public boolean doPairwise; - private TestParams(String master, String test, String MD5) { + private TestParams(String master, String test, final boolean doPairwise, String MD5) { super(TestParams.class); this.master = new File(master); this.test = new File(test); this.MD5 = MD5; + this.doPairwise = doPairwise; } public String toString() { @@ -50,8 +52,10 @@ public class DiffObjectsIntegrationTest extends WalkerTest { @DataProvider(name = "data") public Object[][] createData() { - new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "daead9bfab1a5df72c5e3a239366118e"); - new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "3f46f5a964f7c34015d972256fe49a35"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", true, "fc06e758e5588a52d2dddafdff1665a4"); + new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", true, "3f46f5a964f7c34015d972256fe49a35"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", false, "54dff80e3f9569146dd66d5369c82b48"); + new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", false, "47bf16c27c9e2c657a7e1d13f20880c9"); return TestParams.getTests(TestParams.class); } @@ -61,6 +65,7 @@ public class DiffObjectsIntegrationTest extends WalkerTest { "-T DiffObjects -R public/testdata/exampleFASTA.fasta " + " -m " + params.master + " -t " + params.test + + (params.doPairwise ? " -doPairwise " : "") + " -o %s", Arrays.asList(params.MD5)); executeTest("testDiffObjects:"+params, spec).getFirst();