Final commits to VariantEval

-- Molten now supports variableName and valueName so you don't have to use variable and value if you don't want to.
-- Cleanup code, reorganize a bit more.
-- Fix for broken integrationtests
This commit is contained in:
Mark DePristo 2012-03-30 20:11:06 -04:00
parent f997b1dbf9
commit fbbb8509ad
8 changed files with 87 additions and 61 deletions

View File

@ -24,14 +24,15 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.*;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.AnalysisModuleScanner;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
@ -44,6 +45,9 @@ import java.util.Map;
/**
* Class for writing the GATKReport for VariantEval
*
* Accepts a fulled evaluated (i.e., there's no more data coming) set of stratifications and evaluators
* and supports writing out the data in these evaluators to a GATKReport.
*/
public class VariantEvalReportWriter {
private final GATKReport report;
@ -56,6 +60,12 @@ public class VariantEvalReportWriter {
this.report = initializeGATKReport(stratifiers, evaluators);
}
/**
* The business end of the class. Writes out the data in the provided stratManager
* to the PrintStream out
*
* @param out
*/
public final void writeReport(final PrintStream out) {
for ( int key = 0; key < stratManager.size(); key++ ) {
final String stratStateString = stratManager.getStratsAndStatesForKeyString(key);
@ -63,7 +73,6 @@ public class VariantEvalReportWriter {
final EvaluationContext nec = stratManager.get(key);
for ( final VariantEvaluator ve : nec.getVariantEvaluators() ) {
ve.finalizeEvaluation();
final GATKReportTable table = report.getTable(ve.getSimpleName());
final AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve);
@ -73,16 +82,19 @@ public class VariantEvalReportWriter {
final Field field = scanner.getMoltenField();
final Object fieldValue = field.get(ve);
if ( ! (fieldValue instanceof Map) )
throw new ReviewedStingException("BUG field " + field.getName() + " must be an instance of Map in " + scanner.getAnalysis().name());
if ( fieldValue == null || ! (fieldValue instanceof Map) )
throw new ReviewedStingException("BUG field " + field.getName() + " must be a non-null instance of Map in " + scanner.getAnalysis().name());
final Map<Object, Object> map = (Map<Object, Object>)fieldValue;
if ( map.isEmpty() )
throw new ReviewedStingException("BUG: map is null or empty in analysis " + scanner.getAnalysis());
int counter = 0; // counter is used to ensure printing order is as defined by entrySet
for ( Map.Entry<Object, Object> keyValue : map.entrySet() ) {
// "%05d" is a terrible hack to ensure sort order
final String moltenStratStateString = stratStateString + String.format("%05d", counter++);
setStratificationColumns(table, moltenStratStateString, stratsAndStates);
table.set(moltenStratStateString, "variable", keyValue.getKey());
table.set(moltenStratStateString, "value", keyValue.getValue());
table.set(moltenStratStateString, scanner.getMoltenAnnotation().variableName(), keyValue.getKey());
table.set(moltenStratStateString, scanner.getMoltenAnnotation().valueName(), keyValue.getValue());
}
} else {
setStratificationColumns(table, stratStateString, stratsAndStates);
@ -108,7 +120,7 @@ public class VariantEvalReportWriter {
* @param primaryKey
* @param stratsAndStates
*/
private final void setStratificationColumns(final GATKReportTable table,
private void setStratificationColumns(final GATKReportTable table,
final String primaryKey,
final List<Pair<VariantStratifier, Object>> stratsAndStates) {
for ( final Pair<VariantStratifier, Object> stratAndState : stratsAndStates ) {
@ -136,21 +148,22 @@ public class VariantEvalReportWriter {
*
* @return an initialized report object
*/
public GATKReport initializeGATKReport(final Collection<VariantStratifier> stratifiers,
private GATKReport initializeGATKReport(final Collection<VariantStratifier> stratifiers,
final Collection<VariantEvaluator> evaluators) {
final GATKReport report = new GATKReport();
for (final VariantEvaluator ve : evaluators) {
// create the table
final String tableName = ve.getSimpleName();
final String tableDesc = ve.getClass().getAnnotation(Analysis.class).description();
report.addTable(tableName, tableDesc, true);
// grab the table, and add the columns we need to it
final GATKReportTable table = report.getTable(tableName);
table.addPrimaryKey("entry", false);
table.addColumn(tableName, tableName);
// create a column to hold each startifier state
// first create a column to hold each stratifier state
for (final VariantStratifier vs : stratifiers) {
final String columnName = vs.getName();
table.addColumn(columnName, null, vs.getFormat());
@ -159,11 +172,15 @@ public class VariantEvalReportWriter {
final AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve);
final Map<Field, DataPoint> datamap = scanner.getData();
// deal with the molten issue
if ( scanner.hasMoltenField() ) {
table.addColumn("variable", true, scanner.getMoltenAnnotation().variableFormat());
table.addColumn("value", true, scanner.getMoltenAnnotation().valueFormat());
// deal with molten data
table.addColumn(scanner.getMoltenAnnotation().variableName(), true, scanner.getMoltenAnnotation().variableFormat());
table.addColumn(scanner.getMoltenAnnotation().valueName(), true, scanner.getMoltenAnnotation().valueFormat());
} else {
if ( datamap.isEmpty() )
throw new ReviewedStingException("Datamap is empty for analysis " + scanner.getAnalysis());
// add DataPoint's for each field marked as such
for (final Field field : datamap.keySet()) {
try {
field.setAccessible(true);

View File

@ -1,6 +1,5 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.util.IntervalTree;
@ -13,8 +12,6 @@ import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
@ -23,15 +20,14 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvalu
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.IntervalStratification;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.*;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.VariantEvalUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -41,7 +37,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.lang.reflect.Field;
import java.util.*;
/**
@ -158,10 +153,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required=false)
protected Boolean NO_STANDARD_MODULES = false;
// Other arguments
@Argument(fullName="numSamples", shortName="ns", doc="Number of samples (used if no samples are available in the VCF file", required=false)
protected Integer NUM_SAMPLES = 0;
@Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false)
protected double MIN_PHASE_QUALITY = 10.0;
@ -199,7 +190,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
private Set<String> sampleNamesForEvaluation = new TreeSet<String>();
private Set<String> sampleNamesForStratification = new TreeSet<String>();
private int numSamples = 0;
// important stratifications
private boolean byFilterIsEnabled = false;
@ -250,7 +240,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// Load the sample list
sampleNamesForEvaluation.addAll(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS));
numSamples = NUM_SAMPLES > 0 ? NUM_SAMPLES : sampleNamesForEvaluation.size();
if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
sampleNamesForStratification.addAll(sampleNamesForEvaluation);
@ -541,6 +530,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
*/
public void onTraversalDone(Integer result) {
logger.info("Finalizing variant report");
// go through the evaluations and finalize them
for ( final EvaluationContext nec : stratManager.values() )
for ( final VariantEvaluator ve : nec.getVariantEvaluators() )
ve.finalizeEvaluation();
final VariantEvalReportWriter writer = new VariantEvalReportWriter(stratManager, stratManager.getStratifiers(), stratManager.get(0).getVariantEvaluators());
writer.writeReport(out);
}
@ -548,8 +543,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// Accessors
public Logger getLogger() { return logger; }
public int getNumSamples() { return numSamples; }
public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; }
public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; }
@ -580,10 +573,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
return contigs;
}
public GenomeLocParser getGenomeLocParser() {
return getToolkit().getGenomeLocParser();
}
/**
* getToolkit is protected, so we have to pseudo-overload it here so eval / strats can get the toolkit
* @return
*/
public GenomeAnalysisEngine getToolkit() {
return super.getToolkit();
}

View File

@ -46,21 +46,16 @@ import java.util.*;
@Analysis(description = "Indel length histogram", molten = true)
public class IndelLengthHistogram extends VariantEvaluator implements StandardEval {
private final Map<Integer, Integer> counts = new HashMap<Integer, Integer>();
private final boolean asFrequencies;
private final static boolean asFrequencies = true;
int nIndels = 0;
@Molten(variableFormat = "%d", valueFormat = "%.2f")
@Molten(variableName = "Length", valueName = "Freq", variableFormat = "%d", valueFormat = "%.2f")
public TreeMap<Object, Object> results;
public final static int MAX_SIZE_FOR_HISTOGRAM = 10;
public IndelLengthHistogram() {
this(MAX_SIZE_FOR_HISTOGRAM, true);
}
public IndelLengthHistogram(int maxSize, boolean asFrequencies) {
this.asFrequencies = asFrequencies;
initializeCounts(maxSize);
initializeCounts(MAX_SIZE_FOR_HISTOGRAM);
}
private void initializeCounts(int size) {
@ -98,11 +93,20 @@ public class IndelLengthHistogram extends VariantEvaluator implements StandardEv
}
}
/**
* Update the histogram with the implied length of the indel allele between ref and alt (alt.len - ref.len).
*
* If this size is outside of MAX_SIZE_FOR_HISTOGRAM, the size is capped to MAX_SIZE_FOR_HISTOGRAM
*
* @param ref
* @param alt
*/
public void updateLengthHistogram(final Allele ref, final Allele alt) {
final int len = alt.length() - ref.length();
if ( counts.containsKey(len) ) {
nIndels++;
counts.put(len, counts.get(len) + 1);
}
int len = alt.length() - ref.length();
if ( len > MAX_SIZE_FOR_HISTOGRAM ) len = MAX_SIZE_FOR_HISTOGRAM;
if ( len < -MAX_SIZE_FOR_HISTOGRAM ) len = -MAX_SIZE_FOR_HISTOGRAM;
nIndels++;
counts.put(len, counts.get(len) + 1);
}
}

View File

@ -59,7 +59,7 @@ public abstract class VariantEvaluator implements Comparable<VariantEvaluator> {
return eval.getAttributeAsBoolean(VariantEvalWalker.IS_SINGLETON_KEY, false);
}
public String getSimpleName() {
public final String getSimpleName() {
return simpleName;
}

View File

@ -126,7 +126,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
long sum = 0;
int n = 0;
for ( final Map.Entry<String, Integer> pair : get(type).entrySet() ) {
if ( pair.getKey() != ALL) {
if ( pair.getKey() != ALL) { // truly must be string ==
n++;
sum += pair.getValue();
}
@ -138,7 +138,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
double sum = 0;
int n = 0;
for ( final String sample : get(type).keySet() ) {
if ( (allP && sample == ALL) || (!allP && sample != ALL) ) {
if ( (allP && sample == ALL) || (!allP && sample != ALL) ) { // truly must be string ==
final long num = get(type).get(sample);
final long denom = denoms.get(type).get(sample);
sum += ratio(num, denom);
@ -192,7 +192,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
private boolean overlapsKnownCNV(VariantContext cnv) {
if ( knownCNVs != null ) {
final GenomeLoc loc = getWalker().getGenomeLocParser().createGenomeLoc(cnv, true);
final GenomeLoc loc = getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv, true);
IntervalTree<GenomeLoc> intervalTree = knownCNVs.get(loc.getContig());
final Iterator<IntervalTree.Node<GenomeLoc>> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop());

View File

@ -76,7 +76,7 @@ public class IntervalStratification extends VariantStratifier {
public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
if (eval != null) {
final GenomeLoc loc = getVariantEvalWalker().getGenomeLocParser().createGenomeLoc(eval, true);
final GenomeLoc loc = getVariantEvalWalker().getToolkit().getGenomeLocParser().createGenomeLoc(eval, true);
IntervalTree<GenomeLoc> intervalTree = intervalTreeByContig.get(loc.getContig());
IntervalTree.Node<GenomeLoc> node = intervalTree.minOverlapper(loc.getStart(), loc.getStop());
//logger.info(String.format("Overlap %s found %s", loc, node));

View File

@ -28,7 +28,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
/**
* @Molten for @Analysis modules.
* Molten for @Analysis modules.
*
* If you are flagged as a molten analysis, then there must be one and
* only one annotation in that evaluation module: @Molten which
@ -41,11 +41,23 @@ import java.lang.annotation.RetentionPolicy;
* ...
* keyN valueN
*
* in the output table
* in the output table. The names of these two fields can be override via annotation values.
*/
@Retention(RetentionPolicy.RUNTIME)
public @interface Molten {
String description() default ""; // the description, optional
/**
* The name to use for the molten variable field in the output table.
* @return
*/
String variableName() default "variable";
String variableFormat() default "";
/**
* The name to use for the molten value field in the output table.
* @return
*/
String valueName() default "value";
String valueFormat() default "";
}

View File

@ -302,7 +302,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" +
" --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf";
WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s",
1, Arrays.asList("2192418a70a8e018a1675d4f425155f3"));
1, Arrays.asList("c8a782f51e094dc7be06dbfb795feab2"));
executeTestParallel("testSelect1", spec);
}
@ -323,14 +323,14 @@ public class VariantEvalIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec("-T VariantEval -R "+b37KGReference+" --eval " + variantEvalTestDataRoot + vcfFile + " -ped "+ variantEvalTestDataRoot + pedFile +" -noEV -EV MendelianViolationEvaluator -L 1:10109-10315 -o %s -mvq 0 -noST",
1,
Arrays.asList("03581adcb4f2f7960662fc7ffd910f43"));
Arrays.asList("ddcabc30c88a755a78100e30e0d491d2"));
executeTestParallel("testVEMendelianViolationEvaluator" + vcfFile, spec);
}
@Test
public void testCompVsEvalAC() {
String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("8a7f23063fd7f3a292e5da36778e109e"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("5c409a2ab4517f862c6678902c0fd7a1"));
executeTestParallel("testCompVsEvalAC",spec);
}
@ -360,7 +360,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --dbsnp " + b37dbSNP132 +
" --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
" -noST -ST Novelty -o %s";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("70b0e5b154f3e59e06188e876bbf083f"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("a27c700eabe6b7b3877c8fd4eabb3975"));
executeTestParallel("testEvalTrackWithoutGenotypes",spec);
}
@ -372,7 +372,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
" --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" +
" -noST -ST Novelty -o %s";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("9fe68cc45d9afc5210ccfc8d555722fd"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3272a2db627d4f42bc512df49a8ea64b"));
executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec);
}
@ -488,7 +488,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-o %s"
),
1,
Arrays.asList("c428a76df5039e1e035e3ce45e819d4f")
Arrays.asList("7c01565531cf82c8c03cf042903b96cf")
);
executeTest("testModernVCFWithLargeIndels", spec);
}