Nasty, nasty.....

VariantEval is overly abusive of the GATKReport (lack of) spec.

   1. It converts numeric values (longs, integers and doubles) to string before sending to the Report, then expects it to decipher that those were actually numbers.
   2. Worse, the stratification modules somehow instead of sending the actual values to the report table, sends a string with the value "unknown" and then abuses the GATKReport spec to convert those "unknown" placeholder values with numbers. Then again, it expects the report to know those are numbers, not strings.

   Now that the GATKReport HAS specs, VariantEval needs to be overhauled to conform with that. In the meantime, I have added special ad-hoc treatment to these wrong contracts. It works, and the integration tests all passed without changing any MD5's, but right after Mark and Ryan commit their VariantEval refactors, I will step in to change the way it interacts with the GATKReport, so we can clean up the GATKReport.

   No wonder, the printing needed to be O(n^2).
This commit is contained in:
Mauricio Carneiro 2012-03-29 17:49:42 -04:00
parent 6da9571829
commit cbd21c6339
5 changed files with 28 additions and 8 deletions

View File

@ -132,6 +132,7 @@ public class GATKReportColumn extends LinkedHashMap<Object, Object> {
private static final Collection<String> RIGHT_ALIGN_STRINGS = Arrays.asList(
"null",
"NA",
"unknown",
String.valueOf(Double.POSITIVE_INFINITY),
String.valueOf(Double.NEGATIVE_INFINITY),
String.valueOf(Double.NaN));
@ -144,7 +145,7 @@ public class GATKReportColumn extends LinkedHashMap<Object, Object> {
* @return true if the value is a right alignable
*/
protected static boolean isRightAlign(String value) {
return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value);
return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value.trim());
}
/**
@ -213,7 +214,7 @@ public class GATKReportColumn extends LinkedHashMap<Object, Object> {
public Object put(Object key, Object value) {
if (value != null) {
String formatted = formatValue(value);
if (!formatted.equals("")) {
if (!formatted.equals("") && !formatted.equals("unknown")) {
updateMaxWidth(formatted);
updateFormat(formatted);
}

View File

@ -60,6 +60,8 @@ public class GATKReportTable {
private static final String COULD_NOT_READ_DATA_LINE = "Could not read a data line of this table -- ";
private static final String COULD_NOT_READ_EMPTY_LINE = "Could not read the last empty line of this table -- ";
private static final String OLD_GATK_TABLE_VERSION = "We no longer support older versions of the GATK Tables";
private static final String NUMBER_CONVERSION_EXCEPTION = "String is a number but is not a long or a double: ";
public GATKReportTable(BufferedReader reader, GATKReportVersion version) {
int counter = 0;
@ -413,6 +415,8 @@ public class GATKReportTable {
// This code below is bs. Why am do I have to conform to bad code
// Below is some code to convert a string into its appropriate type.
// I second Roger's rant!
// If we got a string but the column is not a String type
Object newValue = null;

View File

@ -26,18 +26,17 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import net.sf.picard.util.IntervalTree;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
* Stratifies the variants by whether they overlap an interval in the set provided on the command line.

View File

@ -2,10 +2,13 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.LinkedList;
import java.util.List;
@ -18,6 +21,19 @@ public class BQSRGathererUnitTest {
private static File recal = new File("public/testdata/exampleGRP.grp");
@Test(enabled = true)
public void test(){
PrintStream out;
try {
File f = new File("foo2.grp");
out = new PrintStream(f);
} catch (FileNotFoundException e) {
throw new ReviewedStingException("f");
}
GATKReport report = new GATKReport("foo.grp");
report.print(out);
}
//todo -- this test doesnt work because the primary keys in different tables are not the same. Need to either implement "sort" for testing purposes on GATKReport or have a sophisticated comparison measure
@Test(enabled = false)
public void testCombineSimilarFiles() {

View File

@ -439,7 +439,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
buildCommandLine(
"-T VariantEval",
"-R " + b37KGReference,
"--dbsnp " + b37dbSNP132,
"--dbsnp " + "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf",
"--eval " + fundamentalTestSNPsVCF,
"-noEV",
"-EV CountVariants",