Walker index html now emited.

This commit is contained in:
Mark DePristo 2011-07-21 16:01:54 -04:00
parent e892489696
commit 81d0cab27e
3 changed files with 123 additions and 32 deletions

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.diffengine;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -38,9 +39,79 @@ import java.util.List;
/**
* A generic engine for comparing tree-structured objects
*
* <p>
* Compares two record-oriented files, itemizing specific difference between equivalent
* records in the two files. Reports both itemized and summarized differences.
* <p>
* <b>What are the summarized differences and the DiffObjectsWalker</b>
* <p>
* The GATK contains a summarizing difference engine that compares hierarchical data structures to emit:
* <ul>
* <li>A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differences from the value in field A in record 1 in file G.
* <li>A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed.
* </ul>
*
* <p>
* The GATK contains a private walker DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you.
*
* <p>
* <b>Why?</b>
* <p>
* The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others.
*
* <p>Understanding the output
* <p>The DiffEngine system compares to two hierarchical data structures for specific differences in the values of named
* nodes. Suppose I have two trees:
* <pre>
* Tree1=(A=1 B=(C=2 D=3))
* Tree2=(A=1 B=(C=3 D=3 E=4))
* Tree3=(A=1 B=(C=4 D=3 E=4))
* </pre>
* <p>
* where every node in the tree is named, or is a raw value (here all leaf values are integers). The DiffEngine
* traverses these data structures by name, identifies equivalent nodes by fully qualified names
* (Tree1.A is distinct from Tree2.A, and determines where their values are equal (Tree1.A=1, Tree2.A=1, so they are).
* These itemized differences are listed as:
* <pre>
* Tree1.B.C=2 != Tree2.B.C=3
* Tree1.B.C=2 != Tree3.B.C=4
* Tree2.B.C=3 != Tree3.B.C=4
* Tree1.B.E=MISSING != Tree2.B.E=4
* </pre>
* <p>
* This conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though
* is that it computes similarity among the itemized differences and displays the count of differences names
* in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs
* only once. So the summary is:
*
* <pre>
* *.B.C : 3
* *.B.E : 1
* </pre>
* <p>where the * operator indicates that any named field matches. This output is sorted by counts, and provides an
* immediate picture of the commonly occurring differences among the files.
* <p>
* Below is a detailed example of two VCF fields that differ because of a bug in the AC, AF, and AN counting routines,
* detected by the integrationtest integration (more below). You can see that in the although there are many specific
* instances of these differences between the two files, the summarized differences provide an immediate picture that
* the AC, AF, and AN fields are the major causes of the differences.
* <p>
* <pre>
[testng] path count
[testng] *.*.*.AC 6
[testng] *.*.*.AF 6
[testng] *.*.*.AN 6
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC 1
</pre>
*
* @author Mark DePristo
* @since 7/4/11
@ -112,6 +183,7 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false)
boolean showItemizedDifferences = false;
@Hidden
@Argument(fullName="testEnum", doc="X", required=false)
TestEnum testEnum = TestEnum.ONE;

View File

@ -47,7 +47,7 @@ import java.util.*;
*
*/
public class GATKDoclet extends ResourceBundleExtractorDoclet {
RootDoc root;
RootDoc rootDoc;
/**
* Extracts the contents of certain types of javadoc and adds them to an XML file.
@ -69,7 +69,7 @@ public class GATKDoclet extends ResourceBundleExtractorDoclet {
@Override
protected void processDocs(RootDoc rootDoc, PrintStream ignore) {
// setup the global access to the root
root = rootDoc;
this.rootDoc = rootDoc;
try {
/* ------------------------------------------------------------------- */
@ -83,15 +83,14 @@ public class GATKDoclet extends ResourceBundleExtractorDoclet {
// but just use this:
cfg.setObjectWrapper(new DefaultObjectWrapper());
List<Map<String, Object>> indexData = new ArrayList<Map<String, Object>>();
for ( ClassDoc doc : rootDoc.classes() ) {
if ( ResourceBundleExtractorDoclet.isWalker(doc) ) { // && getClassName(doc).contains("UGCalcLikelihoods")) {
System.out.printf("Walker class %s%n", doc);
processWalkerDocs(cfg, doc);
//return;
indexData.add(processWalkerDocs(cfg, doc));
}
// else
// System.out.printf("Excluding non-walker class %s%n", doc);
}
processWalkerIndex(indexData,cfg);
} catch ( FileNotFoundException e ) {
throw new RuntimeException(e);
} catch ( IOException e ) {
@ -99,19 +98,15 @@ public class GATKDoclet extends ResourceBundleExtractorDoclet {
}
}
private void processWalkerDocs(Configuration cfg, ClassDoc doc) throws IOException {
/* ------------------------------------------------------------------- */
/* You usually do these for many times in the application life-cycle: */
// Create the root hash
Map root = buildWalkerDataModel(doc);
private void processWalkerIndex(List<Map<String, Object>> indexData, Configuration cfg) throws IOException {
/* Get or create a template */
Template temp = cfg.getTemplate("test.html");
Template temp = cfg.getTemplate("walker.index.template.html");
/* Merge data-model with template */
Writer out = new OutputStreamWriter(new FileOutputStream(new File("testdoc/" + getClassName(doc).replace(".", "_") + ".html")));
Writer out = new OutputStreamWriter(new FileOutputStream(new File("testdoc/index.html")));
try {
Map<String, Object> root = new HashMap<String, Object>();
root.put("walkers", indexData);
temp.process(root, out);
out.flush();
} catch ( TemplateException e ) {
@ -119,6 +114,32 @@ public class GATKDoclet extends ResourceBundleExtractorDoclet {
}
}
private Map<String, Object> processWalkerDocs(Configuration cfg, ClassDoc doc) throws IOException {
// Create the root hash
Map root = buildWalkerDataModel(doc);
/* Get or create a template */
Template temp = cfg.getTemplate("walker.template.html");
/* Merge data-model with template */
File outputFile = new File(getClassName(doc).replace(".", "_") + ".html");
File outputPath = new File("testdoc/" + outputFile);
try {
Writer out = new OutputStreamWriter(new FileOutputStream(outputPath));
temp.process(root, out);
out.flush();
} catch ( TemplateException e ) {
throw new ReviewedStingException("Failed to create GATK documentation", e);
}
// add index data
Map<String, Object> indexData = new HashMap<String, Object>();
indexData.put("filename", outputFile.toString());
indexData.put("name", doc.name());
indexData.put("summary", root.get("summary"));
return indexData;
}
private Map buildWalkerDataModel(ClassDoc classdoc) {
Map<String, Object> root = new HashMap<String, Object>();
@ -195,7 +216,7 @@ public class GATKDoclet extends ResourceBundleExtractorDoclet {
Field field = getFieldForFieldDoc(fieldDoc);
if ( field.isAnnotationPresent(ArgumentCollection.class) ) {
ClassDoc typeDoc = root.classNamed(fieldDoc.type().qualifiedTypeName());
ClassDoc typeDoc = this.rootDoc.classNamed(fieldDoc.type().qualifiedTypeName());
if ( typeDoc == null )
throw new ReviewedStingException("Tried to get javadocs for ArgumentCollection field " + fieldDoc + " but could't find the class in the RootDoc");
else {

View File

@ -1,13 +1,6 @@
<#macro argumentlist name myargs>
<#if myargs?size != 0>
<h3>${name}</h3>
<table border="1" span=100>
<tr>
<th>Name</th>
<th>Synonyms</th>
<th>Type</th>
<th>Summary</th>
</tr>
<tr><th colspan="4" align="left">${name}</th></tr>
<#list myargs as arg>
<tr>
<td><a href="#${arg.name}">${arg.name}</a></td>
@ -19,7 +12,6 @@
<td>${arg.required}</td>
-->
</#list>
</table>
</#if>
</#macro>
@ -40,8 +32,6 @@
<h1>${name}<h1>
<h2>Summary</h2>
${summary}
<h2>Version</h2>
${version!"unknown version"}
<#if author??>
<h2>Author</h2>
${author}
@ -50,10 +40,18 @@
${description}
<#-- Create the argument summary -->
<h2>Arguments</h2>
<@argumentlist name="Required" myargs=arguments.required/>
<@argumentlist name="Optional" myargs=arguments.optional/>
<@argumentlist name="Hidden" myargs=arguments.hidden/>
<@argumentlist name="Depreciated" myargs=arguments.depreciated/>
<table border="1" cellpadding="2">
<tr>
<th>Name</th>
<th>Synonyms</th>
<th>Type</th>
<th>Summary</th>
</tr>
<@argumentlist name="Required" myargs=arguments.required/>
<@argumentlist name="Optional" myargs=arguments.optional/>
<@argumentlist name="Hidden" myargs=arguments.hidden/>
<@argumentlist name="Depreciated" myargs=arguments.depreciated/>
</table>
<#-- Create the argument details -->
<h2>Argument details</h2>