From 9684a2efb0f8593b4546e39701e99c4212120f26 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 29 Mar 2012 09:41:29 -0400 Subject: [PATCH 1/5] HaplotypeCaller: Variants found on the same haplotype are now written out with phased genotypes. There are serious eval issues with MNPs so disabling them for now. --- .../sting/gatk/walkers/varianteval/evaluators/CompOverlap.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java index 2715b383b..8ef362ba5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java @@ -19,7 +19,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; */ @Analysis(description = "The overlap between eval and comp sites") public class CompOverlap extends VariantEvaluator implements StandardEval { - @DataPoint(description = "number of eval SNP sites", format = "%d") + @DataPoint(description = "number of eval variant sites", format = "%d") long nEvalVariants = 0; @DataPoint(description = "number of eval sites outside of comp sites", format = "%d") From ca96544ed02117b28c166a66f851d4c7e7d4677b Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 29 Mar 2012 16:14:29 -0400 Subject: [PATCH 2/5] All the zero quality N bases in the solid reads are adding lots of extra paths in the assembly graph. We now require a minimum base quality for every base in the kmer before adding it to the graph. The large number of solid reads with unmapped mates was also triggering the active region traversal at every base. We now ignore that check for solid reads. --- .../sting/gatk/traversals/TraverseActiveRegions.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index f9a185650..22d23f216 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.traversals; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; From cbd21c6339f8e65c1aad94a34175d9666b001cb7 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 29 Mar 2012 17:49:42 -0400 Subject: [PATCH 3/5] Nasty, nasty..... VariantEval is overly abusive of the GATKReport (lack of) spec. 1. It converts numeric values (longs, integers and doubles) to string before sending to the Report, then expects it to decipher that those were actually numbers. 2. Worse, the stratification modules somehow instead of sending the actual values to the report table, sends a string with the value "unknown" and then abuses the GATKReport spec to convert those "unknown" placeholder values with numbers. Then again, it expects the report to know those are numbers, not strings. Now that the GATKReport HAS specs, VariantEval needs to be overhauled to conform with that. In the meantime, I have added special ad-hoc treatment to these wrong contracts. It works, and the integration tests all passed without changing any MD5's, but right after Mark and Ryan commit their VariantEval refactors, I will step in to change the way it interacts with the GATKReport, so we can clean up the GATKReport. No wonder, the printing needed to be O(n^2). --- .../sting/gatk/report/GATKReportColumn.java | 5 +++-- .../sting/gatk/report/GATKReportTable.java | 4 ++++ .../stratifications/IntervalStratification.java | 9 ++++----- .../gatk/walkers/bqsr/BQSRGathererUnitTest.java | 16 ++++++++++++++++ .../varianteval/VariantEvalIntegrationTest.java | 2 +- 5 files changed, 28 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 2db22679a..8b54442b0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -132,6 +132,7 @@ public class GATKReportColumn extends LinkedHashMap { private static final Collection RIGHT_ALIGN_STRINGS = Arrays.asList( "null", "NA", + "unknown", String.valueOf(Double.POSITIVE_INFINITY), String.valueOf(Double.NEGATIVE_INFINITY), String.valueOf(Double.NaN)); @@ -144,7 +145,7 @@ public class GATKReportColumn extends LinkedHashMap { * @return true if the value is a right alignable */ protected static boolean isRightAlign(String value) { - return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value); + return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value.trim()); } /** @@ -213,7 +214,7 @@ public class GATKReportColumn extends LinkedHashMap { public Object put(Object key, Object value) { if (value != null) { String formatted = formatValue(value); - if (!formatted.equals("")) { + if (!formatted.equals("") && !formatted.equals("unknown")) { updateMaxWidth(formatted); updateFormat(formatted); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 44d70ac4b..58002bd14 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -60,6 +60,8 @@ public class GATKReportTable { private static final String COULD_NOT_READ_DATA_LINE = "Could not read a data line of this table -- "; private static final String COULD_NOT_READ_EMPTY_LINE = "Could not read the last empty line of this table -- "; private static final String OLD_GATK_TABLE_VERSION = "We no longer support older versions of the GATK Tables"; + + private static final String NUMBER_CONVERSION_EXCEPTION = "String is a number but is not a long or a double: "; public GATKReportTable(BufferedReader reader, GATKReportVersion version) { int counter = 0; @@ -413,6 +415,8 @@ public class GATKReportTable { // This code below is bs. Why am do I have to conform to bad code // Below is some code to convert a string into its appropriate type. + + // I second Roger's rant! // If we got a string but the column is not a String type Object newValue = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java index d91422a7e..879e6066f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java @@ -26,18 +26,17 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import net.sf.picard.util.IntervalTree; import org.apache.log4j.Logger; -import org.broad.tribble.Feature; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; /** * Stratifies the variants by whether they overlap an interval in the set provided on the command line. diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java index fe83dce22..dc6d1c512 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -2,10 +2,13 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.LinkedList; import java.util.List; @@ -18,6 +21,19 @@ public class BQSRGathererUnitTest { private static File recal = new File("public/testdata/exampleGRP.grp"); + @Test(enabled = true) + public void test(){ + PrintStream out; + try { + File f = new File("foo2.grp"); + out = new PrintStream(f); + } catch (FileNotFoundException e) { + throw new ReviewedStingException("f"); + } + GATKReport report = new GATKReport("foo.grp"); + report.print(out); + } + //todo -- this test doesnt work because the primary keys in different tables are not the same. Need to either implement "sort" for testing purposes on GATKReport or have a sophisticated comparison measure @Test(enabled = false) public void testCombineSimilarFiles() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 610733d9c..128f11a31 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -439,7 +439,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "--dbsnp " + b37dbSNP132, + "--dbsnp " + "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf", "--eval " + fundamentalTestSNPsVCF, "-noEV", "-EV CountVariants", From b7c59d5d4347da330ad8a7991bf3d9f182a64746 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 29 Mar 2012 18:00:25 -0400 Subject: [PATCH 4/5] this was a dummy test I was using to figure out what the problem was. Deleting it. --- .../gatk/walkers/bqsr/BQSRGathererUnitTest.java | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java index dc6d1c512..fe83dce22 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -2,13 +2,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; import java.util.LinkedList; import java.util.List; @@ -21,19 +18,6 @@ public class BQSRGathererUnitTest { private static File recal = new File("public/testdata/exampleGRP.grp"); - @Test(enabled = true) - public void test(){ - PrintStream out; - try { - File f = new File("foo2.grp"); - out = new PrintStream(f); - } catch (FileNotFoundException e) { - throw new ReviewedStingException("f"); - } - GATKReport report = new GATKReport("foo.grp"); - report.print(out); - } - //todo -- this test doesnt work because the primary keys in different tables are not the same. Need to either implement "sort" for testing purposes on GATKReport or have a sophisticated comparison measure @Test(enabled = false) public void testCombineSimilarFiles() { From 962fc352ae0841723565e50180e153dee345df3f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 29 Mar 2012 18:01:43 -0400 Subject: [PATCH 5/5] unnecessary substitution. --- .../gatk/walkers/varianteval/VariantEvalIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 128f11a31..610733d9c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -439,7 +439,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "--dbsnp " + "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf", + "--dbsnp " + b37dbSNP132, "--eval " + fundamentalTestSNPsVCF, "-noEV", "-EV CountVariants",