Stop unlimited runtimes in DiffEngine when you have lots of differences

-- Added a new parameter to control the maximum number of pairwise differences to generate. Previously this number could grow very large when there were lots of differences among genotypes, resulting in an n^2 algorithm running with n > 1,000,000
This commit is contained in:
Mark DePristo 2012-05-26 09:05:54 -04:00
parent a6ee4f98b5
commit 31f4e5b52e
3 changed files with 32 additions and 14 deletions

View File

@ -147,11 +147,7 @@ public class DiffEngine {
* @param diffs the list of differences to summarize
*/
public void reportSummarizedDifferences(List<Difference> diffs, SummaryReportParams params ) {
printSummaryReport(summarizeDifferences(diffs), params );
}
public List<Difference> summarizeDifferences(List<Difference> diffs) {
return summarizedDifferencesOfPaths(diffs);
printSummaryReport(summarizedDifferencesOfPaths(diffs, params.maxRawDiffsToSummarize), params );
}
final protected static String[] diffNameToPath(String diffName) {
@ -165,10 +161,11 @@ public class DiffEngine {
diffs.add(new Difference(diff));
}
return summarizedDifferencesOfPaths(diffs);
return summarizedDifferencesOfPaths(diffs, -1);
}
protected List<Difference> summarizedDifferencesOfPaths(List<? extends Difference> singletonDiffs) {
private Map<String, Difference> initialPairwiseSummaries(final List<? extends Difference> singletonDiffs,
final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = new HashMap<String, Difference>();
// create the initial set of differences
@ -184,10 +181,20 @@ public class DiffEngine {
Difference sumDiff = new Difference(path, diffPath2.getMaster(), diffPath2.getTest());
sumDiff.setCount(0);
addSummaryIfMissing(summaries, sumDiff);
if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize)
return summaries;
}
}
}
return summaries;
}
protected List<Difference> summarizedDifferencesOfPaths(final List<? extends Difference> singletonDiffs,
final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize);
// count differences
for ( Difference diffPath : singletonDiffs ) {
for ( Difference sumDiff : summaries.values() ) {
@ -360,17 +367,23 @@ public class DiffEngine {
}
public static class SummaryReportParams {
PrintStream out = System.out;
int maxItemsToDisplay = 0;
int maxCountOneItems = 0;
int minSumDiffToShow = 0;
final PrintStream out;
final int maxItemsToDisplay;
final int maxCountOneItems;
final int minSumDiffToShow;
final int maxRawDiffsToSummarize;
boolean descending = true;
public SummaryReportParams(PrintStream out, int maxItemsToDisplay, int maxCountOneItems, int minSumDiffToShow) {
public SummaryReportParams(PrintStream out,
int maxItemsToDisplay,
int maxCountOneItems,
int minSumDiffToShow,
int maxRawDiffsToSummarize) {
this.out = out;
this.maxItemsToDisplay = maxItemsToDisplay;
this.maxCountOneItems = maxCountOneItems;
this.minSumDiffToShow = minSumDiffToShow;
this.maxRawDiffsToSummarize = maxRawDiffsToSummarize;
}
public void setDescending(boolean descending) {

View File

@ -162,6 +162,10 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false)
int MAX_OBJECTS_TO_READ = -1;
/**
 * Upper bound on the number of pairwise summary differences generated from the raw differences
 * when building the summary report. The value is handed to DiffEngine.SummaryReportParams as
 * its maxRawDiffsToSummarize setting. A value of -1 disables the limit.
 */
@Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of pairwise differences to generate when summarizing the raw differences. -1 [default] means unlimited", required=false)
int maxRawDiffsToSummary = -1;
/**
* The max number of differences to display when summarizing. For example, if there are 10M differences, but
* maxDiffs is 10, then the comparison aborts after the first ten summarized differences are shown. Note that
@ -232,13 +236,14 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
// out.println(test.toString());
List<Difference> diffs = diffEngine.diff(master, test);
out.printf(" Done computing diff, n = %d%n", diffs.size());
if ( showItemizedDifferences ) {
out.printf("Itemized results%n");
for ( Difference diff : diffs )
out.printf("DIFF: %s%n", diff.toString());
}
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff);
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, maxRawDiffsToSummary);
params.setDescending(false);
diffEngine.reportSummarizedDifferences(diffs, params);
}

View File

@ -250,7 +250,7 @@ public class MD5DB {
// TODO -- capture output and put in log
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
final PrintStream ps = new PrintStream(baos);
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0);
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RECORDS_TO_READ);
boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params);
if ( success ) {
final String content = baos.toString();