Stop unlimited runtimes in DiffEngine when you have lots of differences
-- Added a new parameter to control the maximum number of pairwise differences to generate, which previously could expand to a very large number when there were lots of differences among genotypes, resulting in an n^2 algorithm running with n > 1,000,000
This commit is contained in:
parent
a6ee4f98b5
commit
31f4e5b52e
|
|
@ -147,11 +147,7 @@ public class DiffEngine {
|
|||
* @param diffs the list of differences to summarize
|
||||
*/
|
||||
public void reportSummarizedDifferences(List<Difference> diffs, SummaryReportParams params ) {
|
||||
printSummaryReport(summarizeDifferences(diffs), params );
|
||||
}
|
||||
|
||||
public List<Difference> summarizeDifferences(List<Difference> diffs) {
|
||||
return summarizedDifferencesOfPaths(diffs);
|
||||
printSummaryReport(summarizedDifferencesOfPaths(diffs, params.maxRawDiffsToSummarize), params );
|
||||
}
|
||||
|
||||
final protected static String[] diffNameToPath(String diffName) {
|
||||
|
|
@ -165,10 +161,11 @@ public class DiffEngine {
|
|||
diffs.add(new Difference(diff));
|
||||
}
|
||||
|
||||
return summarizedDifferencesOfPaths(diffs);
|
||||
return summarizedDifferencesOfPaths(diffs, -1);
|
||||
}
|
||||
|
||||
protected List<Difference> summarizedDifferencesOfPaths(List<? extends Difference> singletonDiffs) {
|
||||
private Map<String, Difference> initialPairwiseSummaries(final List<? extends Difference> singletonDiffs,
|
||||
final int maxRawDiffsToSummarize) {
|
||||
Map<String, Difference> summaries = new HashMap<String, Difference>();
|
||||
|
||||
// create the initial set of differences
|
||||
|
|
@ -184,10 +181,20 @@ public class DiffEngine {
|
|||
Difference sumDiff = new Difference(path, diffPath2.getMaster(), diffPath2.getTest());
|
||||
sumDiff.setCount(0);
|
||||
addSummaryIfMissing(summaries, sumDiff);
|
||||
|
||||
if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize)
|
||||
return summaries;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return summaries;
|
||||
}
|
||||
|
||||
protected List<Difference> summarizedDifferencesOfPaths(final List<? extends Difference> singletonDiffs,
|
||||
final int maxRawDiffsToSummarize) {
|
||||
Map<String, Difference> summaries = initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize);
|
||||
|
||||
// count differences
|
||||
for ( Difference diffPath : singletonDiffs ) {
|
||||
for ( Difference sumDiff : summaries.values() ) {
|
||||
|
|
@ -360,17 +367,23 @@ public class DiffEngine {
|
|||
}
|
||||
|
||||
public static class SummaryReportParams {
|
||||
PrintStream out = System.out;
|
||||
int maxItemsToDisplay = 0;
|
||||
int maxCountOneItems = 0;
|
||||
int minSumDiffToShow = 0;
|
||||
final PrintStream out;
|
||||
final int maxItemsToDisplay;
|
||||
final int maxCountOneItems;
|
||||
final int minSumDiffToShow;
|
||||
final int maxRawDiffsToSummarize;
|
||||
boolean descending = true;
|
||||
|
||||
public SummaryReportParams(PrintStream out, int maxItemsToDisplay, int maxCountOneItems, int minSumDiffToShow) {
|
||||
public SummaryReportParams(PrintStream out,
|
||||
int maxItemsToDisplay,
|
||||
int maxCountOneItems,
|
||||
int minSumDiffToShow,
|
||||
int maxRawDiffsToSummarize) {
|
||||
this.out = out;
|
||||
this.maxItemsToDisplay = maxItemsToDisplay;
|
||||
this.maxCountOneItems = maxCountOneItems;
|
||||
this.minSumDiffToShow = minSumDiffToShow;
|
||||
this.maxRawDiffsToSummarize = maxRawDiffsToSummarize;
|
||||
}
|
||||
|
||||
public void setDescending(boolean descending) {
|
||||
|
|
|
|||
|
|
@ -162,6 +162,10 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false)
|
||||
int MAX_OBJECTS_TO_READ = -1;
|
||||
|
||||
@Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false)
|
||||
int maxRawDiffsToSummary = -1;
|
||||
|
||||
|
||||
/**
|
||||
* The max number of differences to display when summarizing. For example, if there are 10M differences, but
|
||||
* maxDiffs is 10, then the comparison aborts after first ten summarized differences are shown. Note that
|
||||
|
|
@ -232,13 +236,14 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
|
|||
// out.println(test.toString());
|
||||
|
||||
List<Difference> diffs = diffEngine.diff(master, test);
|
||||
out.printf(" Done computing diff, n = %d%n", diffs.size());
|
||||
if ( showItemizedDifferences ) {
|
||||
out.printf("Itemized results%n");
|
||||
for ( Difference diff : diffs )
|
||||
out.printf("DIFF: %s%n", diff.toString());
|
||||
}
|
||||
|
||||
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff);
|
||||
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, maxRawDiffsToSummary);
|
||||
params.setDescending(false);
|
||||
diffEngine.reportSummarizedDifferences(diffs, params);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -250,7 +250,7 @@ public class MD5DB {
|
|||
// TODO -- capture output and put in log
|
||||
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
final PrintStream ps = new PrintStream(baos);
|
||||
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0);
|
||||
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RECORDS_TO_READ);
|
||||
boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params);
|
||||
if ( success ) {
|
||||
final String content = baos.toString();
|
||||
|
|
|
|||
Loading…
Reference in New Issue