Algorithmically faster version of DiffEngine
-- Now only includes leaf nodes in the summary, i.e., summaries of the form "*.*....*.X", which are really the most valuable to see. This calculation can be accomplished in linear time for N differences, rather than the previous O(n^2) algorithm -- Now computes the max number of elements to read correctly. Counts now the size of the entire element tree, not just the count of the roots, which was painful because the trees vary by orders of magnitude in size. -- Because of this we can enforce a meaningful, useful value for the max elements in MD5 or 100K, and this works well. -- Added integration test for new leaf and old pairwise calculations -- Bugfix for Utils.join(sep, int[]) that was eating the first element of the AD, PL fields
This commit is contained in:
parent
2a86b81a3f
commit
8014178f2f
|
|
@ -55,8 +55,6 @@ public class BAMDiffableReader implements DiffableReader {
|
|||
|
||||
int count = 0;
|
||||
while ( iterator.hasNext() ) {
|
||||
if ( count++ > maxElementsToRead && maxElementsToRead != -1)
|
||||
break;
|
||||
final SAMRecord record = iterator.next();
|
||||
|
||||
// name is the read name + first of pair
|
||||
|
|
@ -88,6 +86,9 @@ public class BAMDiffableReader implements DiffableReader {
|
|||
if ( ! root.hasElement(name) )
|
||||
// protect ourselves from malformed files
|
||||
root.add(readRoot);
|
||||
count += readRoot.size();
|
||||
if ( count > maxElementsToRead && maxElementsToRead != -1)
|
||||
break;
|
||||
}
|
||||
|
||||
reader.close();
|
||||
|
|
|
|||
|
|
@ -147,7 +147,7 @@ public class DiffEngine {
|
|||
* @param diffs the list of differences to summarize
|
||||
*/
|
||||
public void reportSummarizedDifferences(List<Difference> diffs, SummaryReportParams params ) {
|
||||
printSummaryReport(summarizedDifferencesOfPaths(diffs, params.maxRawDiffsToSummarize), params );
|
||||
printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params );
|
||||
}
|
||||
|
||||
final protected static String[] diffNameToPath(String diffName) {
|
||||
|
|
@ -161,9 +161,17 @@ public class DiffEngine {
|
|||
diffs.add(new Difference(diff));
|
||||
}
|
||||
|
||||
return summarizedDifferencesOfPaths(diffs, -1);
|
||||
return summarizedDifferencesOfPaths(diffs, true, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a minimum set of potential differences between all singleton differences
|
||||
* in singletonDiffs. Employs an expensive pairwise O(n^2) algorithm.
|
||||
*
|
||||
* @param singletonDiffs
|
||||
* @param maxRawDiffsToSummarize
|
||||
* @return
|
||||
*/
|
||||
private Map<String, Difference> initialPairwiseSummaries(final List<? extends Difference> singletonDiffs,
|
||||
final int maxRawDiffsToSummarize) {
|
||||
Map<String, Difference> summaries = new HashMap<String, Difference>();
|
||||
|
|
@ -191,9 +199,41 @@ public class DiffEngine {
|
|||
return summaries;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the possible leaf differences among the singleton diffs.
|
||||
*
|
||||
* The leaf differences are all of the form *.*...*.X where all internal
|
||||
* differences are wildcards and the only summarized difference considered
|
||||
* interesting to compute is
|
||||
*
|
||||
* @param singletonDiffs
|
||||
* @param maxRawDiffsToSummarize
|
||||
* @return
|
||||
*/
|
||||
private Map<String, Difference> initialLeafSummaries(final List<? extends Difference> singletonDiffs,
|
||||
final int maxRawDiffsToSummarize) {
|
||||
Map<String, Difference> summaries = new HashMap<String, Difference>();
|
||||
|
||||
// create the initial set of differences
|
||||
for ( final Difference d : singletonDiffs ) {
|
||||
final String path = summarizedPath(d.getParts(), 1);
|
||||
Difference sumDiff = new Difference(path, d.getMaster(), d.getTest());
|
||||
sumDiff.setCount(0);
|
||||
addSummaryIfMissing(summaries, sumDiff);
|
||||
|
||||
if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize)
|
||||
return summaries;
|
||||
}
|
||||
|
||||
return summaries;
|
||||
}
|
||||
|
||||
protected List<Difference> summarizedDifferencesOfPaths(final List<? extends Difference> singletonDiffs,
|
||||
final boolean doPairwise,
|
||||
final int maxRawDiffsToSummarize) {
|
||||
Map<String, Difference> summaries = initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize);
|
||||
final Map<String, Difference> summaries = doPairwise
|
||||
? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize)
|
||||
: initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize);
|
||||
|
||||
// count differences
|
||||
for ( Difference diffPath : singletonDiffs ) {
|
||||
|
|
@ -372,18 +412,21 @@ public class DiffEngine {
|
|||
final int maxCountOneItems;
|
||||
final int minSumDiffToShow;
|
||||
final int maxRawDiffsToSummarize;
|
||||
final boolean doPairwise;
|
||||
boolean descending = true;
|
||||
|
||||
public SummaryReportParams(PrintStream out,
|
||||
int maxItemsToDisplay,
|
||||
int maxCountOneItems,
|
||||
int minSumDiffToShow,
|
||||
int maxRawDiffsToSummarize) {
|
||||
int maxRawDiffsToSummarize,
|
||||
final boolean doPairwise) {
|
||||
this.out = out;
|
||||
this.maxItemsToDisplay = maxItemsToDisplay;
|
||||
this.maxCountOneItems = maxCountOneItems;
|
||||
this.minSumDiffToShow = minSumDiffToShow;
|
||||
this.maxRawDiffsToSummarize = maxRawDiffsToSummarize;
|
||||
this.doPairwise = doPairwise;
|
||||
}
|
||||
|
||||
public void setDescending(boolean descending) {
|
||||
|
|
|
|||
|
|
@ -165,6 +165,8 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false)
|
||||
int maxRawDiffsToSummary = -1;
|
||||
|
||||
@Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false)
|
||||
boolean doPairwise = false;
|
||||
|
||||
/**
|
||||
* The max number of differences to display when summarizing. For example, if there are 10M differences, but
|
||||
|
|
@ -243,7 +245,9 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
|
|||
out.printf("DIFF: %s%n", diff.toString());
|
||||
}
|
||||
|
||||
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, maxRawDiffsToSummary);
|
||||
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out,
|
||||
MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff,
|
||||
maxRawDiffsToSummary, doPairwise);
|
||||
params.setDescending(false);
|
||||
diffEngine.reportSummarizedDifferences(diffs, params);
|
||||
logger.info(String.format("Done summarizing differences"));
|
||||
|
|
|
|||
|
|
@ -81,9 +81,6 @@ public class VCFDiffableReader implements DiffableReader {
|
|||
String prevName = "";
|
||||
Iterator<VariantContext> it = reader.iterator();
|
||||
while ( it.hasNext() ) {
|
||||
if ( count++ > maxElementsToRead && maxElementsToRead != -1)
|
||||
break;
|
||||
|
||||
VariantContext vc = it.next();
|
||||
String name = vc.getChr() + ":" + vc.getStart();
|
||||
if ( name.equals(prevName) ) {
|
||||
|
|
@ -125,6 +122,9 @@ public class VCFDiffableReader implements DiffableReader {
|
|||
}
|
||||
|
||||
root.add(vcRoot);
|
||||
count += vcRoot.size();
|
||||
if ( count > maxElementsToRead && maxElementsToRead != -1)
|
||||
break;
|
||||
}
|
||||
|
||||
reader.close();
|
||||
|
|
|
|||
|
|
@ -227,7 +227,8 @@ public class Utils {
|
|||
if ( ints == null || ints.length == 0)
|
||||
return "";
|
||||
else {
|
||||
StringBuilder ret = new StringBuilder(ints[0]);
|
||||
StringBuilder ret = new StringBuilder();
|
||||
ret.append(ints[0]);
|
||||
for (int i = 1; i < ints.length; ++i) {
|
||||
ret.append(separator);
|
||||
ret.append(ints[i]);
|
||||
|
|
|
|||
|
|
@ -48,8 +48,8 @@ public class MD5DB {
|
|||
/**
|
||||
* Subdirectory under the ant build directory where we store integration test md5 results
|
||||
*/
|
||||
private static final int MAX_RECORDS_TO_READ = 1000;
|
||||
private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 100;
|
||||
private static final int MAX_RECORDS_TO_READ = 100000;
|
||||
private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = -1;
|
||||
public static final String LOCAL_MD5_DB_DIR = "integrationtests";
|
||||
public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests";
|
||||
|
||||
|
|
@ -284,7 +284,7 @@ public class MD5DB {
|
|||
// inline differences
|
||||
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
final PrintStream ps = new PrintStream(baos);
|
||||
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE);
|
||||
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false);
|
||||
boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params);
|
||||
if ( success ) {
|
||||
final String content = baos.toString();
|
||||
|
|
|
|||
|
|
@ -35,12 +35,14 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
|
|||
private class TestParams extends TestDataProvider {
|
||||
public File master, test;
|
||||
public String MD5;
|
||||
public boolean doPairwise;
|
||||
|
||||
private TestParams(String master, String test, String MD5) {
|
||||
private TestParams(String master, String test, final boolean doPairwise, String MD5) {
|
||||
super(TestParams.class);
|
||||
this.master = new File(master);
|
||||
this.test = new File(test);
|
||||
this.MD5 = MD5;
|
||||
this.doPairwise = doPairwise;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
|
@ -50,8 +52,10 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
|
|||
|
||||
@DataProvider(name = "data")
|
||||
public Object[][] createData() {
|
||||
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "daead9bfab1a5df72c5e3a239366118e");
|
||||
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "3f46f5a964f7c34015d972256fe49a35");
|
||||
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", true, "fc06e758e5588a52d2dddafdff1665a4");
|
||||
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", true, "3f46f5a964f7c34015d972256fe49a35");
|
||||
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", false, "54dff80e3f9569146dd66d5369c82b48");
|
||||
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", false, "47bf16c27c9e2c657a7e1d13f20880c9");
|
||||
return TestParams.getTests(TestParams.class);
|
||||
}
|
||||
|
||||
|
|
@ -61,6 +65,7 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
|
|||
"-T DiffObjects -R public/testdata/exampleFASTA.fasta "
|
||||
+ " -m " + params.master
|
||||
+ " -t " + params.test
|
||||
+ (params.doPairwise ? " -doPairwise " : "")
|
||||
+ " -o %s",
|
||||
Arrays.asList(params.MD5));
|
||||
executeTest("testDiffObjects:"+params, spec).getFirst();
|
||||
|
|
|
|||
Loading…
Reference in New Issue