From 38740b0ff5e70df6c5d760251d129ed22d0eeccb Mon Sep 17 00:00:00 2001 From: "Mark A. DePristo" Date: Mon, 4 Jul 2011 16:11:42 -0400 Subject: [PATCH 01/22] First working version of the DiffNode readers for VCF and BAM files. Unit tests confirm the readers are approximately working. Skeleton of a working DiffObjects walker that will be able to provide detailed information about how exactly two files of the same type differ, so long as the files are supported by the DiffNode structure. --- public/testdata/diffTestMaster.vcf | 11 +++++++++++ public/testdata/diffTestTest.vcf | 11 +++++++++++ 2 files changed, 22 insertions(+) create mode 100644 public/testdata/diffTestMaster.vcf create mode 100644 public/testdata/diffTestTest.vcf diff --git a/public/testdata/diffTestMaster.vcf b/public/testdata/diffTestMaster.vcf new file mode 100644 index 000000000..549f54345 --- /dev/null +++ b/public/testdata/diffTestMaster.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.0 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 +chr1 2979 rs62635286 T G 83.67 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,32:9:-33.61,-2.71,-0.00:27.09 +chr1 2981 rs62028691 A G 14.69 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,33:9:-32.12,-2.71,-0.00:27.08 +chr1 4536 rs11582131 G C 0.18 PASS AC=1;AF=0.50;AN=2 GT:AD:DP:GL:GQ 0/1:42,33:16:-41.67,-4.82,-26.29:99 +chr1 4562 rs11490464 C G 0.14 PASS AC=1;AF=0.50;AN=2 GT:AD:DP:GL:GQ 0/1:26,30:9:-19.64,-2.72,-14.87:99 +chr1 4770 rs6682375 A G 0.32 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:9,111:84:-306.27,-28.58,-3.46:99 +chr1 4793 rs6682385 A G 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:4,115:109:-350.74,-32.88,-0.10:99 +chr1 5074 rs11586607 T G 0.01 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:29,97:39:-130.41,-11.75,-3.82:79.31 +chr1 5137 rs62636497 A T 140.49 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:0,74:39:-148.99,-11.75,-0.01:99 diff --git a/public/testdata/diffTestTest.vcf b/public/testdata/diffTestTest.vcf new file mode 100644 index 000000000..8699ab253 --- /dev/null +++ b/public/testdata/diffTestTest.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.0 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 +chr1 2979 rs62635286 T G 83.67 CHANGED_FILTER AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,32:9:-33.61,-2.71,-0.00:27.09 +chr1 2981 rs62028691 A G 14.69 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,33:9:-32.12,-2.71,-0.00:27.08 +chr1 4536 rs11582131 G C 0.18 PASS AC=2;AF=0.50;AN=2 GT:AD:DP:GL:GQ 0/1:42,33:16:-41.67,-4.82,-26.29:99 +chr1 4562 rs11490464 C G 0.14 PASS AC=1;AF=0.50;AN=2 GT:AD:DP:GL:GQ 1/1:26,30:9:-19.64,-2.72,-14.87:99 +chr1 4770 rs6682375 A G 0.32 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 0/1:9,111:84:-306.27,-28.58,-3.46:99 +chr1 4793 rs6682385 A G 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:4,114:109:-350.74,-32.88,-0.10:99 +chr1 5074 rs11586607 T G 0.01 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:29,97:39:-130.41,-11.74,-3.82:79.31 +chr1 5137 rs62636497 A T 140.49 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:0,74:39:-148.99,-11.75,-0.01:9 From 080875d5daf876ebd29b059c156890d7c349983f Mon Sep 17 00:00:00 2001 From: "Mark A. DePristo" Date: Tue, 5 Jul 2011 16:13:39 -0400 Subject: [PATCH 05/22] Refactored DiffNode/DiffElement/DiffValue class structure. DiffElement is now a pair of Name -> Value, where value is either a DiffValue or its subclass DiffNode. Code cleaned up, more tests added. DiffEngine is now working, with tests. DiffObjectWalker can now take two VCFs and itemize the difference between the two files correctly and concisely. --- build.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.xml b/build.xml index fe1723587..21066686f 100644 --- a/build.xml +++ b/build.xml @@ -741,8 +741,8 @@ - - + + From ccf34f7e45164cea70beccb4cf1b0791e6acde06 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Jul 2011 21:57:22 -0400 Subject: [PATCH 07/22] (1) Added very useful helper class TestDataProvider to BaseTest that making creating data providers for TestNG far easier (2) DiffEngine now officially working with with summaries. Extensive UnitTests all around! --- .../org/broadinstitute/sting/BaseTest.java | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 61bb8b34b..b469c8a41 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -12,6 +12,10 @@ import java.io.*; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * @@ -107,6 +111,57 @@ public abstract class BaseTest { } } + /** + * Simple generic utility class to creating TestNG data providers: + * + * 1: inherit this class, as in + * + * private class SummarizeDifferenceTest extends TestDataProvider { + * public SummarizeDifferenceTest() { + * super(SummarizeDifferenceTest.class); + * } + * ... + * } + * + * Provide a reference to your class to the TestDataProvider constructor. + * + * 2: Create instances of your subclass. Return from it the call to getTests, providing + * the class type of your test + * + * @DataProvider(name = "summaries") + * public Object[][] createSummaries() { + * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); + * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); + * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); + * } + * + * This class magically tracks created objects of this + */ + public static class TestDataProvider { + private static final Map> tests = new HashMap>(); + + /** + * Create a new TestDataProvider instance bound to the class variable C + * @param c + */ + public TestDataProvider(Class c) { + if ( ! tests.containsKey(c) ) + tests.put(c, new ArrayList()); + tests.get(c).add(this); + } + + /** + * Return all of the data providers in the form expected by TestNG of type class C + * @param c + * @return + */ + public static Object[][] getTests(Class c) { + List params2 = new ArrayList(); + for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); + return params2.toArray(new Object[][]{}); + } + } + /** * test if the file exists * From 3d4f0e9dd76d1ab23a7e7fdecd61067b57b8e7d7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 7 Jul 2011 17:21:15 -0400 Subject: [PATCH 11/22] Now supports the case where you have multiple AC values in the info field. --- .../gatk/walkers/varianteval/stratifications/AlleleCount.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index ff59c9e29..2cbc66e31 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -43,9 +43,9 @@ public class AlleleCount extends VariantStratifier { if (eval != null) { int AC = -1; - if ( eval.hasAttribute("AC") ) + if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) { AC = eval.getAttributeAsInt("AC"); - else if ( eval.isVariant() ) { + } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getChromosomeCount(allele)); } else From 4cfe0dd857891f6c70f4d0e58b60d6c4c401a0cd Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 7 Jul 2011 23:01:03 -0400 Subject: [PATCH 14/22] Test for bad alleles so that we don't generate IndexOutOfBoundsExceptions --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index cd0f52c68..01344a117 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -225,7 +225,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, loc = pos + alleles.get(0).length() - 1; } else if ( !isSingleNucleotideEvent(alleles) ) { ArrayList newAlleles = new ArrayList(); - loc = clipAlleles(pos, ref, alleles, newAlleles); + loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); alleles = newAlleles; } @@ -504,7 +504,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @param clippedAlleles output list of clipped alleles * @return a list of alleles, clipped to the reference */ - protected static long clipAlleles(long position, String ref, List unclippedAlleles, List clippedAlleles) { + protected static long clipAlleles(long position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { // Note that the computation of forward clipping here is meant only to see whether there is a common // base to all alleles, and to correctly compute reverse clipping, @@ -522,6 +522,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, } if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0) clipping = false; + else if (ref.length() == reverseClipped) + generateException("bad alleles encountered", lineNo); else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1]) clipping = false; } From 2a4b3ae4a20ea3d5bbd3047f2c2f7842cd01e2ef Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 8 Jul 2011 12:48:33 -0400 Subject: [PATCH 17/22] Cleaning up / removing most of the monkeying around with annotation values that happens in VariantDataManager --- .../VariantDataManager.java | 34 +++++-------------- .../VariantRecalibrator.java | 6 ++-- ...VariantRecalibratorArgumentCollection.java | 2 +- 3 files changed, 13 insertions(+), 29 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 5f35c182c..ddeda1699 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -82,19 +82,11 @@ public class VariantDataManager { } foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6); - if( annotationKeys.get(iii).toLowerCase().contains("ranksum") ) { // BUGBUG: to clean up - for( final VariantDatum datum : data ) { - if( datum.annotations[iii] > 0.0 ) { datum.annotations[iii] /= 3.0; } - } - } meanVector[iii] = theMean; varianceVector[iii] = theSTD; for( final VariantDatum datum : data ) { + // Transform each data point via: (x - mean) / standard deviation datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); - // Each data point is now [ (x - mean) / standard deviation ] - if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) { - datum.annotations[iii] /= 3.0; - } } } if( foundZeroVarianceAnnotation ) { @@ -163,7 +155,7 @@ public class VariantDataManager { final int numBadSitesAdded = trainingData.size(); logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." ); - // Next, sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants + // Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants Collections.sort( data ); final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) ); if( numToAdd > data.size() ) { @@ -241,23 +233,15 @@ public class VariantDataManager { double value; try { - if( annotationKey.equalsIgnoreCase("QUAL") ) { - value = vc.getPhredScaledQual(); - } else if( annotationKey.equalsIgnoreCase("DP") ) { - value = Double.parseDouble( (String)vc.getAttribute( "DP" ) ) / Double.parseDouble( (String)vc.getAttribute( "AN" ) ); - } else { - value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) ); - if( Double.isInfinite(value) ) { value = Double.NaN; } - if( annotationKey.equalsIgnoreCase("InbreedingCoeff") && value > 0.05 ) { value = Double.NaN; } - if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM - value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); - } - if( annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - if( annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } + value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) ); + if( Double.isInfinite(value) ) { value = Double.NaN; } + if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM + value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - + if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } + if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } } catch( Exception e ) { - value = Double.NaN; // The VQSR works with missing data now by marginalizing over the missing dimension when evaluating Gaussians + value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model } return value; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 2c51f02d6..2d0355d7d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -284,7 +284,7 @@ public class VariantRecalibrator extends RodWalker Date: Fri, 8 Jul 2011 12:48:49 -0400 Subject: [PATCH 18/22] Bug fix: if we're genotyping a very long indel (>100 bp) fail gracefully instead of with an array out of bounds exception --- .../org/broadinstitute/sting/utils/genotype/Haplotype.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index cb6557408..31791e805 100755 --- a/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -133,8 +133,12 @@ public class Haplotype { byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases); + int startAfter = startIdxInReference+numPrefBases+ refAllele.getBases().length; + // protect against long events that overrun available reference context + if (startAfter > refBases.length) + startAfter = refBases.length; byte[] basesAfterVariant = Arrays.copyOfRange(refBases, - startIdxInReference+numPrefBases+ refAllele.getBases().length, refBases.length); + startAfter, refBases.length); // Create location for all haplotypes From a3c9d9c3ff561a0a910b5313e0ebb037b7883a4b Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 8 Jul 2011 15:34:39 -0400 Subject: [PATCH 22/22] Fixing Contracts for Java, and enabling contracts by default for unit/integration tests. The NullPointerException we were seeing when trying to run with contracts enabled was being caused by an outdated version of the asm library. To run tests without contracts and disable their compilation, pass in "-Duse.contracts=false" to ant. Also did some minor unrelated cleanup in build.xml --- build.xml | 70 +++++++++++------- ivy.xml | 4 + settings/ivysettings.xml | 1 + .../cofoja-1.0-20110609.jar | Bin .../cofoja-1.0-20110609.xml | 3 + 5 files changed, 51 insertions(+), 27 deletions(-) rename settings/repository/{com.google => com.google.code.cofoja}/cofoja-1.0-20110609.jar (100%) create mode 100644 settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml diff --git a/build.xml b/build.xml index 2a23f74c1..986d89213 100644 --- a/build.xml +++ b/build.xml @@ -28,6 +28,7 @@ + @@ -44,11 +45,11 @@ - - + + - - + + @@ -69,7 +70,7 @@ - + @@ -103,7 +104,7 @@ - + @@ -128,14 +129,14 @@ - + - + - + @@ -147,7 +148,7 @@ - + + + + + @@ -252,7 +257,7 @@ - + @@ -287,7 +292,7 @@ depends="gatk.compile.public.source,gatk.compile.private.source,gatk.compile.external.source" description="compile the GATK source" /> - + @@ -299,7 +304,16 @@ - + + + + + + + + + + @@ -312,7 +326,7 @@ + description="create GATK contracts" if="include.contracts" /> @@ -452,7 +466,7 @@ - + @@ -663,7 +677,7 @@ - + - - + @@ -820,7 +836,7 @@ - + @@ -828,7 +844,7 @@ - + @@ -921,8 +937,8 @@ - - + + @@ -944,7 +960,7 @@ - + @@ -969,7 +985,7 @@ - + diff --git a/ivy.xml b/ivy.xml index c2a6c4ccd..ce724bc3c 100644 --- a/ivy.xml +++ b/ivy.xml @@ -60,6 +60,10 @@ + + + + diff --git a/settings/ivysettings.xml b/settings/ivysettings.xml index 1e47fa847..b77414df9 100644 --- a/settings/ivysettings.xml +++ b/settings/ivysettings.xml @@ -25,5 +25,6 @@ + diff --git a/settings/repository/com.google/cofoja-1.0-20110609.jar b/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.jar similarity index 100% rename from settings/repository/com.google/cofoja-1.0-20110609.jar rename to settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.jar diff --git a/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml b/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml new file mode 100644 index 000000000..38d4e88f1 --- /dev/null +++ b/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml @@ -0,0 +1,3 @@ + + +