From 38740b0ff5e70df6c5d760251d129ed22d0eeccb Mon Sep 17 00:00:00 2001 From: "Mark A. DePristo" Date: Mon, 4 Jul 2011 16:11:42 -0400 Subject: [PATCH 01/38] First working version of the DiffNode readers for VCF and BAM files. Unit tests confirm the readers are approximately working. Skeleton of a working DiffObjects walker that will be able to provide detailed information about how exactly two files of the same type differ, so long as the files are supported by the DiffNode structure. --- public/testdata/diffTestMaster.vcf | 11 +++++++++++ public/testdata/diffTestTest.vcf | 11 +++++++++++ 2 files changed, 22 insertions(+) create mode 100644 public/testdata/diffTestMaster.vcf create mode 100644 public/testdata/diffTestTest.vcf diff --git a/public/testdata/diffTestMaster.vcf b/public/testdata/diffTestMaster.vcf new file mode 100644 index 000000000..549f54345 --- /dev/null +++ b/public/testdata/diffTestMaster.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.0 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 +chr1 2979 rs62635286 T G 83.67 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,32:9:-33.61,-2.71,-0.00:27.09 +chr1 2981 rs62028691 A G 14.69 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,33:9:-32.12,-2.71,-0.00:27.08 +chr1 4536 rs11582131 G C 0.18 PASS AC=1;AF=0.50;AN=2 GT:AD:DP:GL:GQ 0/1:42,33:16:-41.67,-4.82,-26.29:99 +chr1 4562 rs11490464 C G 0.14 PASS AC=1;AF=0.50;AN=2 GT:AD:DP:GL:GQ 0/1:26,30:9:-19.64,-2.72,-14.87:99 +chr1 4770 rs6682375 A G 0.32 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:9,111:84:-306.27,-28.58,-3.46:99 +chr1 4793 rs6682385 A G 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:4,115:109:-350.74,-32.88,-0.10:99 +chr1 5074 rs11586607 T G 0.01 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:29,97:39:-130.41,-11.75,-3.82:79.31 +chr1 5137 rs62636497 A T 140.49 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:0,74:39:-148.99,-11.75,-0.01:99 diff --git a/public/testdata/diffTestTest.vcf b/public/testdata/diffTestTest.vcf new file mode 100644 index 000000000..8699ab253 --- /dev/null +++ b/public/testdata/diffTestTest.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.0 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 +chr1 2979 rs62635286 T G 83.67 CHANGED_FILTER AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,32:9:-33.61,-2.71,-0.00:27.09 +chr1 2981 rs62028691 A G 14.69 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,33:9:-32.12,-2.71,-0.00:27.08 +chr1 4536 rs11582131 G C 0.18 PASS AC=2;AF=0.50;AN=2 GT:AD:DP:GL:GQ 0/1:42,33:16:-41.67,-4.82,-26.29:99 +chr1 4562 rs11490464 C G 0.14 PASS AC=1;AF=0.50;AN=2 GT:AD:DP:GL:GQ 1/1:26,30:9:-19.64,-2.72,-14.87:99 +chr1 4770 rs6682375 A G 0.32 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 0/1:9,111:84:-306.27,-28.58,-3.46:99 +chr1 4793 rs6682385 A G 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:4,114:109:-350.74,-32.88,-0.10:99 +chr1 5074 rs11586607 T G 0.01 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:29,97:39:-130.41,-11.74,-3.82:79.31 +chr1 5137 rs62636497 A T 140.49 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:0,74:39:-148.99,-11.75,-0.01:9 From 080875d5daf876ebd29b059c156890d7c349983f Mon Sep 17 00:00:00 2001 From: "Mark A. DePristo" Date: Tue, 5 Jul 2011 16:13:39 -0400 Subject: [PATCH 05/38] Refactored DiffNode/DiffElement/DiffValue class structure. DiffElement is now a pair of Name -> Value, where value is either a DiffValue or its subclass DiffNode. Code cleaned up, more tests added. DiffEngine is now working, with tests. DiffObjectWalker can now take two VCFs and itemize the difference between the two files correctly and concisely. --- build.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.xml b/build.xml index fe1723587..21066686f 100644 --- a/build.xml +++ b/build.xml @@ -741,8 +741,8 @@ - - + + From ccf34f7e45164cea70beccb4cf1b0791e6acde06 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Jul 2011 21:57:22 -0400 Subject: [PATCH 07/38] (1) Added very useful helper class TestDataProvider to BaseTest that making creating data providers for TestNG far easier (2) DiffEngine now officially working with with summaries. Extensive UnitTests all around! --- .../org/broadinstitute/sting/BaseTest.java | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 61bb8b34b..b469c8a41 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -12,6 +12,10 @@ import java.io.*; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * @@ -107,6 +111,57 @@ public abstract class BaseTest { } } + /** + * Simple generic utility class to creating TestNG data providers: + * + * 1: inherit this class, as in + * + * private class SummarizeDifferenceTest extends TestDataProvider { + * public SummarizeDifferenceTest() { + * super(SummarizeDifferenceTest.class); + * } + * ... + * } + * + * Provide a reference to your class to the TestDataProvider constructor. + * + * 2: Create instances of your subclass. Return from it the call to getTests, providing + * the class type of your test + * + * @DataProvider(name = "summaries") + * public Object[][] createSummaries() { + * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); + * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); + * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); + * } + * + * This class magically tracks created objects of this + */ + public static class TestDataProvider { + private static final Map> tests = new HashMap>(); + + /** + * Create a new TestDataProvider instance bound to the class variable C + * @param c + */ + public TestDataProvider(Class c) { + if ( ! tests.containsKey(c) ) + tests.put(c, new ArrayList()); + tests.get(c).add(this); + } + + /** + * Return all of the data providers in the form expected by TestNG of type class C + * @param c + * @return + */ + public static Object[][] getTests(Class c) { + List params2 = new ArrayList(); + for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); + return params2.toArray(new Object[][]{}); + } + } + /** * test if the file exists * From 50111db2b7920ba5b3cfe659e33f7135ebd277f5 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 7 Jul 2011 15:02:48 -0400 Subject: [PATCH 09/38] Fixing non-determinism in single-threaded VQSR by moving references to cern.Normal over to the static random generator available in GenomeAnalysisEngine --- .../GaussianMixtureModel.java | 24 ++++++++++--------- .../VariantDataManager.java | 3 +-- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index acc1f24cc..41fea0896 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import Jama.Matrix; -import cern.jet.random.Normal; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MathUtils; @@ -97,7 +96,7 @@ public class GaussianMixtureModel { int ttt = 0; while( ttt++ < numIterations ) { - // Estep: assign each variant to the nearest cluster + // E step: assign each variant to the nearest cluster for( final VariantDatum datum : data ) { double minDistance = Double.MAX_VALUE; MultivariateGaussian minGaussian = null; @@ -112,7 +111,7 @@ public class GaussianMixtureModel { datum.assignment = minGaussian; } - // Mstep: update gaussian means based on assigned variants + // M step: update gaussian means based on assigned variants for( final MultivariateGaussian gaussian : gaussians ) { gaussian.zeroOutMu(); int numAssigned = 0; @@ -216,26 +215,29 @@ public class GaussianMixtureModel { } public double evaluateDatumMarginalized( final VariantDatum datum ) { - int numVals = 0; + int numSamples = 0; double sumPVarInGaussian = 0.0; - int numIter = 10; + final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization final double[] pVarInGaussianLog10 = new double[gaussians.size()]; + // for each dimension for( int iii = 0; iii < datum.annotations.length; iii++ ) { - // marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod + // if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod if( datum.isNull[iii] ) { - for( int ttt = 0; ttt < numIter; ttt++ ) { - datum.annotations[iii] = Normal.staticNextDouble(0.0, 1.0); + for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) { + datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution + // evaluate this random data point int gaussianIndex = 0; for( final MultivariateGaussian gaussian : gaussians ) { pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum ); } - sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); - numVals++; + // add this sample's probability to the pile in order to take an average in the end + sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k)) + numSamples++; } } } - return Math.log10( sumPVarInGaussian / ((double) numVals) ); + return Math.log10( sumPVarInGaussian / ((double) numSamples) ); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index f0a78280d..e1a076e76 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; -import cern.jet.random.Normal; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -91,7 +90,7 @@ public class VariantDataManager { meanVector[iii] = theMean; varianceVector[iii] = theSTD; for( final VariantDatum datum : data ) { - datum.annotations[iii] = ( datum.isNull[iii] ? Normal.staticNextDouble(0.0, 1.0) : ( datum.annotations[iii] - theMean ) / theSTD ); + datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); // Each data point is now [ (x - mean) / standard deviation ] if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) { datum.annotations[iii] /= 3.0; From 212e9a1a0cf766f54f70d17d0786db570f8111ae Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 7 Jul 2011 15:18:57 -0400 Subject: [PATCH 10/38] Fixing unstable build after stable commit --- .../gatk/walkers/variantrecalibration/GaussianMixtureModel.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index b4a8c4c32..17461de2f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import Jama.Matrix; +import cern.jet.random.Normal; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MathUtils; From 3d4f0e9dd76d1ab23a7e7fdecd61067b57b8e7d7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 7 Jul 2011 17:21:15 -0400 Subject: [PATCH 15/38] Now supports the case where you have multiple AC values in the info field. --- .../gatk/walkers/varianteval/stratifications/AlleleCount.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index ff59c9e29..2cbc66e31 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -43,9 +43,9 @@ public class AlleleCount extends VariantStratifier { if (eval != null) { int AC = -1; - if ( eval.hasAttribute("AC") ) + if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) { AC = eval.getAttributeAsInt("AC"); - else if ( eval.isVariant() ) { + } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getChromosomeCount(allele)); } else From 4cfe0dd857891f6c70f4d0e58b60d6c4c401a0cd Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 7 Jul 2011 23:01:03 -0400 Subject: [PATCH 18/38] Test for bad alleles so that we don't generate IndexOutOfBoundsExceptions --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index cd0f52c68..01344a117 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -225,7 +225,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, loc = pos + alleles.get(0).length() - 1; } else if ( !isSingleNucleotideEvent(alleles) ) { ArrayList newAlleles = new ArrayList(); - loc = clipAlleles(pos, ref, alleles, newAlleles); + loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); alleles = newAlleles; } @@ -504,7 +504,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @param clippedAlleles output list of clipped alleles * @return a list of alleles, clipped to the reference */ - protected static long clipAlleles(long position, String ref, List unclippedAlleles, List clippedAlleles) { + protected static long clipAlleles(long position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { // Note that the computation of forward clipping here is meant only to see whether there is a common // base to all alleles, and to correctly compute reverse clipping, @@ -522,6 +522,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, } if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0) clipping = false; + else if (ref.length() == reverseClipped) + generateException("bad alleles encountered", lineNo); else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1]) clipping = false; } From 2a4b3ae4a20ea3d5bbd3047f2c2f7842cd01e2ef Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 8 Jul 2011 12:48:33 -0400 Subject: [PATCH 21/38] Cleaning up / removing most of the monkeying around with annotation values that happens in VariantDataManager --- .../VariantDataManager.java | 34 +++++-------------- .../VariantRecalibrator.java | 6 ++-- ...VariantRecalibratorArgumentCollection.java | 2 +- 3 files changed, 13 insertions(+), 29 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 5f35c182c..ddeda1699 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -82,19 +82,11 @@ public class VariantDataManager { } foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6); - if( annotationKeys.get(iii).toLowerCase().contains("ranksum") ) { // BUGBUG: to clean up - for( final VariantDatum datum : data ) { - if( datum.annotations[iii] > 0.0 ) { datum.annotations[iii] /= 3.0; } - } - } meanVector[iii] = theMean; varianceVector[iii] = theSTD; for( final VariantDatum datum : data ) { + // Transform each data point via: (x - mean) / standard deviation datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); - // Each data point is now [ (x - mean) / standard deviation ] - if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) { - datum.annotations[iii] /= 3.0; - } } } if( foundZeroVarianceAnnotation ) { @@ -163,7 +155,7 @@ public class VariantDataManager { final int numBadSitesAdded = trainingData.size(); logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." ); - // Next, sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants + // Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants Collections.sort( data ); final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) ); if( numToAdd > data.size() ) { @@ -241,23 +233,15 @@ public class VariantDataManager { double value; try { - if( annotationKey.equalsIgnoreCase("QUAL") ) { - value = vc.getPhredScaledQual(); - } else if( annotationKey.equalsIgnoreCase("DP") ) { - value = Double.parseDouble( (String)vc.getAttribute( "DP" ) ) / Double.parseDouble( (String)vc.getAttribute( "AN" ) ); - } else { - value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) ); - if( Double.isInfinite(value) ) { value = Double.NaN; } - if( annotationKey.equalsIgnoreCase("InbreedingCoeff") && value > 0.05 ) { value = Double.NaN; } - if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM - value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); - } - if( annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - if( annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } + value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) ); + if( Double.isInfinite(value) ) { value = Double.NaN; } + if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM + value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - + if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } + if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } } catch( Exception e ) { - value = Double.NaN; // The VQSR works with missing data now by marginalizing over the missing dimension when evaluating Gaussians + value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model } return value; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 2c51f02d6..2d0355d7d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -284,7 +284,7 @@ public class VariantRecalibrator extends RodWalker Date: Fri, 8 Jul 2011 12:48:49 -0400 Subject: [PATCH 22/38] Bug fix: if we're genotyping a very long indel (>100 bp) fail gracefully instead of with an array out of bounds exception --- .../org/broadinstitute/sting/utils/genotype/Haplotype.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index cb6557408..31791e805 100755 --- a/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -133,8 +133,12 @@ public class Haplotype { byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases); + int startAfter = startIdxInReference+numPrefBases+ refAllele.getBases().length; + // protect against long events that overrun available reference context + if (startAfter > refBases.length) + startAfter = refBases.length; byte[] basesAfterVariant = Arrays.copyOfRange(refBases, - startIdxInReference+numPrefBases+ refAllele.getBases().length, refBases.length); + startAfter, refBases.length); // Create location for all haplotypes From a3c9d9c3ff561a0a910b5313e0ebb037b7883a4b Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 8 Jul 2011 15:34:39 -0400 Subject: [PATCH 27/38] Fixing Contracts for Java, and enabling contracts by default for unit/integration tests. The NullPointerException we were seeing when trying to run with contracts enabled was being caused by an outdated version of the asm library. To run tests without contracts and disable their compilation, pass in "-Duse.contracts=false" to ant. Also did some minor unrelated cleanup in build.xml --- build.xml | 70 +++++++++++------- ivy.xml | 4 + settings/ivysettings.xml | 1 + .../cofoja-1.0-20110609.jar | Bin .../cofoja-1.0-20110609.xml | 3 + 5 files changed, 51 insertions(+), 27 deletions(-) rename settings/repository/{com.google => com.google.code.cofoja}/cofoja-1.0-20110609.jar (100%) create mode 100644 settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml diff --git a/build.xml b/build.xml index 2a23f74c1..986d89213 100644 --- a/build.xml +++ b/build.xml @@ -28,6 +28,7 @@ + @@ -44,11 +45,11 @@ - - + + - - + + @@ -69,7 +70,7 @@ - + @@ -103,7 +104,7 @@ - + @@ -128,14 +129,14 @@ - + - + - + @@ -147,7 +148,7 @@ - + + + + + @@ -252,7 +257,7 @@ - + @@ -287,7 +292,7 @@ depends="gatk.compile.public.source,gatk.compile.private.source,gatk.compile.external.source" description="compile the GATK source" /> - + @@ -299,7 +304,16 @@ - + + + + + + + + + + @@ -312,7 +326,7 @@ + description="create GATK contracts" if="include.contracts" /> @@ -452,7 +466,7 @@ - + @@ -663,7 +677,7 @@ - + - - + @@ -820,7 +836,7 @@ - + @@ -828,7 +844,7 @@ - + @@ -921,8 +937,8 @@ - - + + @@ -944,7 +960,7 @@ - + @@ -969,7 +985,7 @@ - + diff --git a/ivy.xml b/ivy.xml index c2a6c4ccd..ce724bc3c 100644 --- a/ivy.xml +++ b/ivy.xml @@ -60,6 +60,10 @@ + + + + diff --git a/settings/ivysettings.xml b/settings/ivysettings.xml index 1e47fa847..b77414df9 100644 --- a/settings/ivysettings.xml +++ b/settings/ivysettings.xml @@ -25,5 +25,6 @@ + diff --git a/settings/repository/com.google/cofoja-1.0-20110609.jar b/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.jar similarity index 100% rename from settings/repository/com.google/cofoja-1.0-20110609.jar rename to settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.jar diff --git a/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml b/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml new file mode 100644 index 000000000..38d4e88f1 --- /dev/null +++ b/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml @@ -0,0 +1,3 @@ + + + From 8a78414432226daceccccb4b38463c0826fb0f19 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 11 Jul 2011 12:10:11 -0400 Subject: [PATCH 28/38] Removed TileCovariate as a dependency for AnalyzeCovariates.jar --- public/packages/AnalyzeCovariates.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/public/packages/AnalyzeCovariates.xml b/public/packages/AnalyzeCovariates.xml index 1862d6cbb..7e31934df 100644 --- a/public/packages/AnalyzeCovariates.xml +++ b/public/packages/AnalyzeCovariates.xml @@ -10,7 +10,6 @@ - From 86890c63574eba41bbbce6c52e2614f6c81b6f27 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 11 Jul 2011 16:16:15 -0400 Subject: [PATCH 30/38] N and K (in binomial probability) got switched in RFA Walker with the last commit. No longer will NaNs be produced. Added: TableToVCF. Kind of a longer-term project, but there are lots of variant calls available in a weird tabular format. I used this to convert Ju Et Al small indels to VCF. I'll check against the 1000G ASN superpopulation calls to see if we see a good amount of recapitulation, and if so, i'll put them in unvalidated comparisons. Minor chances to the TableCodec and TableFeatures to allow for this (the codec can sometimes drop a column, and the feature now allows you to grab on to its header). --- .../sting/gatk/refdata/features/table/TableCodec.java | 0 .../sting/gatk/refdata/features/table/TableFeature.java | 6 +++++- .../broadinstitute/sting/utils/variantcontext/Allele.java | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) mode change 100644 => 100755 public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java mode change 100644 => 100755 public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java old mode 100644 new mode 100755 diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java old mode 100644 new mode 100755 index 6ff0384a0..4b4ebe450 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java @@ -55,10 +55,14 @@ public class TableFeature implements Feature { } public List getAllValues() { - return getValuesTo(values.size()-1); + return getValuesTo(values.size()); } public List getValuesTo(int columnPosition) { return values.subList(0,columnPosition); } + + public List getHeader() { + return keys; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java index a9ba46159..901de6fae 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java @@ -108,7 +108,7 @@ public class Allele implements Comparable { this.bases = bases; if ( ! acceptableAlleleBases(bases) ) - throw new IllegalArgumentException("Unexpected base in allele bases " + new String(bases)); + throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'"); } private Allele(String bases, boolean isRef) { From e3748675dbd518042ad67cfc653c7f1a5f89b327 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 11 Jul 2011 17:40:45 -0400 Subject: [PATCH 31/38] Support for VCF 4.1 header counts --- .../annotator/DepthPerAlleleBySample.java | 3 +- .../ReadDepthAndAllelicFractionBySample.java | 5 +- .../gatk/walkers/annotator/SampleList.java | 3 +- .../utils/codecs/vcf/StandardVCFWriter.java | 11 +-- .../codecs/vcf/VCFCompoundHeaderLine.java | 91 +++++++++++++++---- .../sting/utils/codecs/vcf/VCFConstants.java | 2 + .../utils/codecs/vcf/VCFFormatHeaderLine.java | 4 + .../utils/codecs/vcf/VCFHeaderLineCount.java | 8 ++ .../utils/codecs/vcf/VCFInfoHeaderLine.java | 4 + 9 files changed, 101 insertions(+), 30 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineCount.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 754d28dfd..ee66b50ee 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -142,5 +143,5 @@ public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnot // public String getIndelBases() public List getKeyNames() { return Arrays.asList("AD"); } - public List getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), VCFCompoundHeaderLine.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); } + public List getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java index f287549bb..a670532af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java @@ -29,6 +29,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; @@ -200,8 +201,8 @@ public class ReadDepthAndAllelicFractionBySample implements GenotypeAnnotation { 1, VCFHeaderLineType.Integer, "Total read depth per sample, including MQ0"), - new VCFFormatHeaderLine(getKeyNames().get(1), - VCFCompoundHeaderLine.UNBOUNDED, + new VCFFormatHeaderLine(getKeyNames().get(1), + VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample")); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index 82f16be42..e2fd2a3d4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -65,5 +66,5 @@ public class SampleList implements InfoFieldAnnotation { public List getKeyNames() { return Arrays.asList("Samples"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Samples", VCFInfoHeaderLine.UNBOUNDED, VCFHeaderLineType.String, "List of polymorphic samples")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Samples", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "List of polymorphic samples")); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 31251c089..230773310 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -360,14 +360,7 @@ public class StandardVCFWriter implements VCFWriter { if ( !entry.getValue().equals("") ) { int numVals = 1; VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key); - if ( metaData != null ) - numVals = metaData.getCount(); - - // take care of unbounded encoding - if ( numVals == VCFInfoHeaderLine.UNBOUNDED ) - numVals = 1; - - if ( numVals > 0 ) { + if ( metaData != null && (metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() > 0) ) { mWriter.write("="); mWriter.write(entry.getValue()); } @@ -423,7 +416,7 @@ public class StandardVCFWriter implements VCFWriter { VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(key); if ( metaData != null ) { - int numInFormatField = metaData.getCount(); + int numInFormatField = metaData.getCount(vc.getAlternateAlleles().size()); if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. // For example, if Number=2, the string has to be ".,." diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index a799161ad..49f9ab184 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + import java.util.Arrays; import java.util.LinkedHashMap; import java.util.Map; @@ -43,26 +45,43 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF // the field types private String name; - private int count; + private int count = -1; + private VCFHeaderLineCount countType; private String description; private VCFHeaderLineType type; // access methods public String getName() { return name; } - public int getCount() { return count; } public String getDescription() { return description; } public VCFHeaderLineType getType() { return type; } + public VCFHeaderLineCount getCountType() { return countType; } + public int getCount() { + if ( countType != VCFHeaderLineCount.INTEGER ) + throw new ReviewedStingException("Asking for header line count when type is not an integer"); + return count; + } - // - public void setNumberToUnbounded() { this.count = UNBOUNDED; } + // utility method + public int getCount(int numAltAlleles) { + int myCount; + switch ( countType ) { + case INTEGER: myCount = count; break; + case UNBOUNDED: myCount = -1; break; + case A: myCount = numAltAlleles; break; + case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break; + default: throw new ReviewedStingException("Unknown count type: " + countType); + } + return myCount; + } + + public void setNumberToUnbounded() { + countType = VCFHeaderLineCount.UNBOUNDED; + count = -1; + } // our type of line, i.e. format, info, etc private final SupportedHeaderLineType lineType; - // line numerical values are allowed to be unbounded (or unknown), which is - // marked with a dot (.) - public static final int UNBOUNDED = -1; // the value we store internally for unbounded types - /** * create a VCF format header line * @@ -74,6 +93,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { super(lineType.toString(), ""); this.name = name; + this.countType = VCFHeaderLineCount.INTEGER; this.count = count; this.type = type; this.description = description; @@ -81,6 +101,24 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF validate(); } + /** + * create a VCF format header line + * + * @param name the name for this header line + * @param count the count type for this header line + * @param type the type for this header line + * @param description the description for this header line + */ + protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.name = name; + this.countType = count; + this.type = type; + this.description = description; + this.lineType = lineType; + validate(); + } + /** * create a VCF format header line * @@ -92,9 +130,22 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF super(lineType.toString(), ""); Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description")); name = mapping.get("ID"); - count = (version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) ? - mapping.get("Number").equals(VCFConstants.UNBOUNDED_ENCODING_v4) ? UNBOUNDED : Integer.valueOf(mapping.get("Number")) : - mapping.get("Number").equals(VCFConstants.UNBOUNDED_ENCODING_v3) ? UNBOUNDED : Integer.valueOf(mapping.get("Number")); + count = -1; + final String numberStr = mapping.get("Number"); + if ( numberStr.equals(VCFConstants.PER_ALLELE_COUNT) ) { + countType = VCFHeaderLineCount.A; + } else if ( numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT) ) { + countType = VCFHeaderLineCount.G; + } else if ( ((version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) && + numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || + ((version == VCFHeaderVersion.VCF3_2 || version == VCFHeaderVersion.VCF3_3) && + numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3)) ) { + countType = VCFHeaderLineCount.UNBOUNDED; + } else { + countType = VCFHeaderLineCount.INTEGER; + count = Integer.valueOf(numberStr); + + } type = VCFHeaderLineType.valueOf(mapping.get("Type")); if (type == VCFHeaderLineType.Flag && !allowFlagValues()) throw new IllegalArgumentException("Flag is an unsupported type for this kind of field"); @@ -121,7 +172,15 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF protected String toStringEncoding() { Map map = new LinkedHashMap(); map.put("ID", name); - map.put("Number", count == UNBOUNDED ? VCFConstants.UNBOUNDED_ENCODING_v4 : count); + Object number; + switch ( countType ) { + case A: number = VCFConstants.PER_ALLELE_COUNT; break; + case G: number = VCFConstants.PER_GENOTYPE_COUNT; break; + case UNBOUNDED: number = VCFConstants.UNBOUNDED_ENCODING_v4; break; + case INTEGER: + default: number = count; + } + map.put("Number", number); map.put("Type", type); map.put("Description", description); return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); @@ -136,15 +195,13 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF if ( !(o instanceof VCFCompoundHeaderLine) ) return false; VCFCompoundHeaderLine other = (VCFCompoundHeaderLine)o; - return name.equals(other.name) && - count == other.count && - description.equals(other.description) && - type == other.type && - lineType == other.lineType; + return equalsExcludingDescription(other) && + description.equals(other.description); } public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) { return count == other.count && + countType == other.countType && type == other.type && lineType == other.lineType && name.equals(other.name); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java index 695c46c27..91cf86c70 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java @@ -99,6 +99,8 @@ public final class VCFConstants { public static final String MISSING_DEPTH_v3 = "-1"; public static final String UNBOUNDED_ENCODING_v4 = "."; public static final String UNBOUNDED_ENCODING_v3 = "-1"; + public static final String PER_ALLELE_COUNT = "A"; + public static final String PER_GENOTYPE_COUNT = "G"; public static final String EMPTY_ALLELE = "."; public static final String EMPTY_GENOTYPE = "./."; public static final double MAX_GENOTYPE_QUAL = 99.0; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java index 352be3e97..f68cb670b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java @@ -16,6 +16,10 @@ public class VCFFormatHeaderLine extends VCFCompoundHeaderLine { throw new IllegalArgumentException("Flag is an unsupported type for format fields"); } + public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { + super(name, count, type, description, SupportedHeaderLineType.INFO); + } + protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) { super(line, version, SupportedHeaderLineType.FORMAT); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineCount.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineCount.java new file mode 100644 index 000000000..d615c7c78 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineCount.java @@ -0,0 +1,8 @@ +package org.broadinstitute.sting.utils.codecs.vcf; + +/** + * the count encodings we use for fields in VCF header lines + */ +public enum VCFHeaderLineCount { + INTEGER, A, G, UNBOUNDED; +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java index 135a5c1a1..9b20f38a1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java @@ -13,6 +13,10 @@ public class VCFInfoHeaderLine extends VCFCompoundHeaderLine { super(name, count, type, description, SupportedHeaderLineType.INFO); } + public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { + super(name, count, type, description, SupportedHeaderLineType.INFO); + } + protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) { super(line, version, SupportedHeaderLineType.INFO); } From e93052a51e88ddb77c8ce71cf6a14a449ca3b1aa Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Mon, 11 Jul 2011 19:17:58 -0400 Subject: [PATCH 32/38] When generating the QGraph, don't regenerate if there aren't scatter/gather jobs. Fixed a display issue with the number of milliseconds that Queue has tried to contact LSF. --- .../sting/queue/engine/QGraph.scala | 44 ++++++++++--------- .../queue/engine/lsf/Lsf706JobRunner.scala | 6 +-- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index bfcc4d48c..8ed3f84c1 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -138,30 +138,32 @@ class QGraph extends Logging { validate() if (running && numMissingValues == 0) { - logger.info("Generating scatter gather jobs.") val scatterGathers = jobGraph.edgeSet.filter(edge => scatterGatherable(edge)) + if (!scatterGathers.isEmpty) { + logger.info("Generating scatter gather jobs.") - var addedFunctions = List.empty[QFunction] - for (scatterGather <- scatterGathers) { - val functions = scatterGather.asInstanceOf[FunctionEdge] - .function.asInstanceOf[ScatterGatherableFunction] - .generateFunctions() - addedFunctions ++= functions + var addedFunctions = List.empty[QFunction] + for (scatterGather <- scatterGathers) { + val functions = scatterGather.asInstanceOf[FunctionEdge] + .function.asInstanceOf[ScatterGatherableFunction] + .generateFunctions() + addedFunctions ++= functions + } + + logger.info("Removing original jobs.") + this.jobGraph.removeAllEdges(scatterGathers) + prune() + + logger.info("Adding scatter gather jobs.") + addedFunctions.foreach(function => if (running) this.add(function)) + + logger.info("Regenerating graph.") + fill + val scatterGatherDotFile = if (settings.expandedDotFile != null) settings.expandedDotFile else settings.dotFile + if (scatterGatherDotFile != null) + renderToDot(scatterGatherDotFile) + validate() } - - logger.info("Removing original jobs.") - this.jobGraph.removeAllEdges(scatterGathers) - prune() - - logger.info("Adding scatter gather jobs.") - addedFunctions.foreach(function => if (running) this.add(function)) - - logger.info("Regenerating graph.") - fill - val scatterGatherDotFile = if (settings.expandedDotFile != null) settings.expandedDotFile else settings.dotFile - if (scatterGatherDotFile != null) - renderToDot(scatterGatherDotFile) - validate() } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index 57d133dfe..ac2f036b4 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -286,11 +286,11 @@ object Lsf706JobRunner extends Logging { // LSB_SHAREDIR/cluster_name/logdir/lsb.acct (man bacct) // LSB_SHAREDIR/cluster_name/logdir/lsb.events (man bhist) logger.debug("Job Id %s status / exitStatus / exitInfo: ??? / ??? / ???".format(runner.jobId)) - val unknownStatusSeconds = (System.currentTimeMillis - runner.lastStatusUpdate) - if (unknownStatusSeconds > (unknownStatusMaxSeconds * 1000L)) { + val unknownStatusMillis = (System.currentTimeMillis - runner.lastStatusUpdate) + if (unknownStatusMillis > (unknownStatusMaxSeconds * 1000L)) { // Unknown status has been returned for a while now. runner.updateStatus(RunnerStatus.FAILED) - logger.error("Unable to read LSF status for %d minutes: job id %d: %s".format(unknownStatusSeconds/60, runner.jobId, runner.function.description)) + logger.error("Unable to read LSF status for %0.2f minutes: job id %d: %s".format(unknownStatusMillis/(60 * 1000D), runner.jobId, runner.function.description)) } } From 5e593793af43153e844f69c0f0df367b1d56a658 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 11 Jul 2011 23:10:27 -0400 Subject: [PATCH 33/38] DiffEngine utility function simpleDiffFiles printSummaryReport now uses GATKReport for nice formating Moved print formatting arguments into inner class provided to printing functions themselves, not the class BAMDiffableReader only reads 1000 entries to avoid performance issue. Work around for BAM files with non-unique names Uncommented all of the incorrectly commented out CombineVariants integrationtests BaseTest now uses DiffEngine to provide inline differences to VCF and BAM files --- .../org/broadinstitute/sting/BaseTest.java | 15 +- .../CombineVariantsIntegrationTest.java | 142 +++++++++--------- 2 files changed, 83 insertions(+), 74 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index b469c8a41..b3e422ba9 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -4,6 +4,7 @@ import org.apache.commons.io.FileUtils; import org.apache.log4j.*; import org.apache.log4j.spi.LoggingEvent; import org.broadinstitute.sting.commandline.CommandLineUtils; +import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; @@ -334,11 +335,14 @@ public abstract class BaseTest { if (parameterize || expectedMD5.equals("")) { // Don't assert - } else { - Assert.assertEquals(filemd5sum, expectedMD5, name + " Mismatching MD5s"); + } else if ( filemd5sum.equals(expectedMD5) ) { System.out.println(String.format(" => %s PASSED", name)); + } else { + Assert.fail(String.format("%s has mismatching MD5s: expected=%s observed=%s", name, expectedMD5, filemd5sum)); } + + return filemd5sum; } @@ -381,7 +385,12 @@ public abstract class BaseTest { System.out.printf("##### Path to calculated file (MD5=%s): %s%n", filemd5sum, pathToFileMD5File); System.out.printf("##### Diff command: diff %s %s%n", pathToExpectedMD5File, pathToFileMD5File); - // todo -- add support for simple inline display of the first N differences for text file + // inline differences + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(System.out, 20, 10, 0); + boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), params); + if ( success ) + System.out.printf("Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n", + pathToExpectedMD5File, pathToFileMD5File); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 33a20f7b5..600718aa0 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -34,76 +34,76 @@ import java.util.Arrays; * Tests CombineVariants */ public class CombineVariantsIntegrationTest extends WalkerTest { -// public static String baseTestString(String args) { -// return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args; -// } -// -// public void test1InOut(String file, String md5, boolean vcf3) { -// test1InOut(file, md5, "", vcf3); -// } -// -// public void test1InOut(String file, String md5, String args, boolean vcf3) { -// WalkerTestSpec spec = new WalkerTestSpec( -// baseTestString(" -priority v1 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file + args), -// 1, -// Arrays.asList(md5)); -// executeTest("testInOut1--" + file, spec); -// } -// -// public void combine2(String file1, String file2, String args, String md5, boolean vcf3) { -// WalkerTestSpec spec = new WalkerTestSpec( -// baseTestString(" -priority v1,v2 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file1 + " -B:v2,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file2 + args), -// 1, -// Arrays.asList(md5)); -// executeTest("combine2 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec); -// } -// -// public void combineSites(String args, String md5) { -// String file1 = "1000G_omni2.5.b37.sites.vcf"; -// String file2 = "hapmap_3.3.b37.sites.vcf"; -// WalkerTestSpec spec = new WalkerTestSpec( -// "-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference -// + " -L 1:1-10,000,000 -B:omni,VCF " + validationDataLocation + file1 -// + " -B:hm3,VCF " + validationDataLocation + file2 + args, -// 1, -// Arrays.asList(md5)); -// executeTest("combineSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); -// } -// -// -// @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2117fff6e0d182cd20be508e9661829c", true); } -// @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2cfaf7af3dd119df08b8a9c1f72e2f93", " -setKey foo", true); } -// @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "1474ac0fde2ce42a3c24f1c97eab333e", " -setKey null", true); } -// @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "7fc66df048a0ab08cf507906e1d4a308", false); } // official project VCF files in tabix format -// -// @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "ec9715f53dbf4531570557c212822f12", false); } -// @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "f1072be5f5c6ee810276d9ca6537224d", false); } -// -// @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "b77a1eec725201d9d8e74ee0c45638d3", false); } // official project VCF files in tabix format -// @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "802977fdfd2f4905b501bb06800f60af", false); } // official project VCF files in tabix format -// @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "a67157287dd2b24b5cdf7ebf8fcbbe9a", false); } -// -// @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e1f4718a179f1196538a33863da04f53", false); } -// -// @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "b3783384b7c8e877b971033e90beba48", true); } -// -// @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "902e541c87caa72134db6293fc46f0ad"); } -// @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "f339ad4bb5863b58b9c919ce7d040bb9"); } -// -// @Test public void threeWayWithRefs() { -// WalkerTestSpec spec = new WalkerTestSpec( -// baseTestString(" -B:NA19240_BGI,VCF "+validationDataLocation+"NA19240.BGI.RG.vcf" + -// " -B:NA19240_ILLUMINA,VCF "+validationDataLocation+"NA19240.ILLUMINA.RG.vcf" + -// " -B:NA19240_WUGSC,VCF "+validationDataLocation+"NA19240.WUGSC.RG.vcf" + -// " -B:denovoInfo,VCF "+validationDataLocation+"yri_merged_validation_data_240610.annotated.b36.vcf" + -// " -setKey centerSet" + -// " -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED" + -// " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + -// " -genotypeMergeOptions UNIQUIFY -L 1"), -// 1, -// Arrays.asList("a07995587b855f3214fb71940bf23c0f")); -// executeTest("threeWayWithRefs", spec); -// } + public static String baseTestString(String args) { + return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args; + } + + public void test1InOut(String file, String md5, boolean vcf3) { + test1InOut(file, md5, "", vcf3); + } + + public void test1InOut(String file, String md5, String args, boolean vcf3) { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -priority v1 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file + args), + 1, + Arrays.asList(md5)); + executeTest("testInOut1--" + file, spec); + } + + public void combine2(String file1, String file2, String args, String md5, boolean vcf3) { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -priority v1,v2 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file1 + " -B:v2,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file2 + args), + 1, + Arrays.asList(md5)); + executeTest("combine2 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec); + } + + public void combineSites(String args, String md5) { + String file1 = "1000G_omni2.5.b37.sites.vcf"; + String file2 = "hapmap_3.3.b37.sites.vcf"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference + + " -L 1:1-10,000,000 -B:omni,VCF " + validationDataLocation + file1 + + " -B:hm3,VCF " + validationDataLocation + file2 + args, + 1, + Arrays.asList(md5)); + executeTest("combineSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); + } + + + @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2117fff6e0d182cd20be508e9661829c", true); } + @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2cfaf7af3dd119df08b8a9c1f72e2f93", " -setKey foo", true); } + @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "1474ac0fde2ce42a3c24f1c97eab333e", " -setKey null", true); } + @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "7fc66df048a0ab08cf507906e1d4a308", false); } // official project VCF files in tabix format + + @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "ec9715f53dbf4531570557c212822f12", false); } + @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "f1072be5f5c6ee810276d9ca6537224d", false); } + + @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "b77a1eec725201d9d8e74ee0c45638d3", false); } // official project VCF files in tabix format + @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "802977fdfd2f4905b501bb06800f60af", false); } // official project VCF files in tabix format + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "a67157287dd2b24b5cdf7ebf8fcbbe9a", false); } + + @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e1f4718a179f1196538a33863da04f53", false); } + + @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "b3783384b7c8e877b971033e90beba48", true); } + + @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "902e541c87caa72134db6293fc46f0ad"); } + @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "f339ad4bb5863b58b9c919ce7d040bb9"); } + + @Test public void threeWayWithRefs() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -B:NA19240_BGI,VCF "+validationDataLocation+"NA19240.BGI.RG.vcf" + + " -B:NA19240_ILLUMINA,VCF "+validationDataLocation+"NA19240.ILLUMINA.RG.vcf" + + " -B:NA19240_WUGSC,VCF "+validationDataLocation+"NA19240.WUGSC.RG.vcf" + + " -B:denovoInfo,VCF "+validationDataLocation+"yri_merged_validation_data_240610.annotated.b36.vcf" + + " -setKey centerSet" + + " -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED" + + " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + + " -genotypeMergeOptions UNIQUIFY -L 1"), + 1, + Arrays.asList("a07995587b855f3214fb71940bf23c0f")); + executeTest("threeWayWithRefs", spec); + } // complex examples with filtering, indels, and multiple alleles @@ -119,7 +119,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { executeTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); } - @Test public void complexTestFull() { combineComplexSites("", "64b991fd3850f83614518f7d71f0532f"); } +// @Test public void complexTestFull() { combineComplexSites("", "64b991fd3850f83614518f7d71f0532f"); } @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "0db9ef50fe54b60426474273d7c7fa99"); } @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "d20acb3d53ba0a02ce92d540ebeda2a9"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "8d1b3d120515f8b56b5a0d10bc5da713"); } From 893cc2e103e25daf01018c86382869ddac0e1f4a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 11 Jul 2011 23:15:08 -0400 Subject: [PATCH 34/38] Making the package public, so there's no dependances from public -> private --- .../walkers/diffengine/BAMDiffableReader.java | 122 +++++ .../gatk/walkers/diffengine/DiffElement.java | 118 +++++ .../gatk/walkers/diffengine/DiffEngine.java | 423 ++++++++++++++++++ .../gatk/walkers/diffengine/DiffNode.java | 239 ++++++++++ .../walkers/diffengine/DiffObjectsWalker.java | 113 +++++ .../gatk/walkers/diffengine/DiffValue.java | 90 ++++ .../walkers/diffengine/DiffableReader.java | 50 +++ .../gatk/walkers/diffengine/Difference.java | 58 +++ .../walkers/diffengine/VCFDiffableReader.java | 119 +++++ 9 files changed, 1332 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java new file mode 100644 index 000000000..f7a395d9d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import net.sf.samtools.*; +import net.sf.samtools.util.BlockCompressedInputStream; +import org.broad.tribble.readers.AsciiLineReader; +import org.broad.tribble.readers.LineReader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.Map; +import java.util.zip.GZIPInputStream; + + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 1:09 PM + * + * Class implementing diffnode reader for VCF + */ +public class BAMDiffableReader implements DiffableReader { + private final static int MAX_RECORDS_TO_READ = 1000; + @Override + public String getName() { return "BAM"; } + + @Override + public DiffElement readFromFile(File file) { + final SAMFileReader reader = new SAMFileReader(file, null); // null because we don't want it to look for the index + reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); + + DiffNode root = DiffNode.rooted(file.getName()); + SAMRecordIterator iterator = reader.iterator(); + + int count = 0; + while ( iterator.hasNext() ) { + if ( count++ > MAX_RECORDS_TO_READ ) + break; + final SAMRecord record = iterator.next(); + + // name is the read name + first of pair + String name = record.getReadName().replace('.', '_'); + if ( record.getReadPairedFlag() ) { + name += record.getFirstOfPairFlag() ? "_1" : "_2"; + } + + DiffNode readRoot = DiffNode.empty(name, root); + + // add fields + readRoot.add("NAME", record.getReadName()); + readRoot.add("FLAGS", record.getFlags()); + readRoot.add("RNAME", record.getReferenceName()); + readRoot.add("POS", record.getAlignmentStart()); + readRoot.add("MAPQ", record.getMappingQuality()); + readRoot.add("CIGAR", record.getCigarString()); + readRoot.add("RNEXT", record.getMateReferenceName()); + readRoot.add("PNEXT", record.getMateAlignmentStart()); + readRoot.add("TLEN", record.getInferredInsertSize()); + readRoot.add("SEQ", record.getReadString()); + readRoot.add("QUAL", record.getBaseQualityString()); + + for ( SAMRecord.SAMTagAndValue xt : record.getAttributes() ) { + readRoot.add(xt.tag, xt.value); + } + + // add record to root + if ( ! root.hasElement(name) ) + // protect ourselves from malformed files + root.add(readRoot); + } + + reader.close(); + + return root.getBinding(); + } + + @Override + public boolean canRead(File file) { + final byte[] BAM_MAGIC = "BAM\1".getBytes(); + final byte[] buffer = new byte[BAM_MAGIC.length]; + try { + FileInputStream fstream = new FileInputStream(file); + new BlockCompressedInputStream(fstream).read(buffer,0,BAM_MAGIC.length); + return Arrays.equals(buffer, BAM_MAGIC); + } catch ( IOException e ) { + return false; + } catch ( net.sf.samtools.FileTruncatedException e ) { + return false; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java new file mode 100644 index 000000000..eff24bb88 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import com.google.java.contract.*; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:55 PM + * + * An interface that must be implemented to allow us to calculate differences + * between structured objects + */ +@Invariant({ + "name != null", + "value != null", + "parent != null || name.equals(\"ROOT\")", + "value == null || value.getBinding() == this"}) +public class DiffElement { + public final static DiffElement ROOT = new DiffElement(); + + final private String name; + final private DiffElement parent; + final private DiffValue value; + + /** + * For ROOT only + */ + private DiffElement() { + this.name = "ROOT"; + this.parent = null; + this.value = new DiffValue(this, "ROOT"); + } + + @Requires({"name != null", "parent != null", "value != null"}) + public DiffElement(String name, DiffElement parent, DiffValue value) { + if ( name.equals("ROOT") ) throw new IllegalArgumentException("Cannot use reserved name ROOT"); + this.name = name; + this.parent = parent; + this.value = value; + this.value.setBinding(this); + } + + @Ensures({"result != null"}) + public String getName() { + return name; + } + + public DiffElement getParent() { + return parent; + } + + @Ensures({"result != null"}) + public DiffValue getValue() { + return value; + } + + public boolean isRoot() { return this == ROOT; } + + @Ensures({"result != null"}) + @Override + public String toString() { + return getName() + "=" + getValue().toString(); + } + + public String toString(int offset) { + return (offset > 0 ? Utils.dupString(' ', offset) : 0) + getName() + "=" + getValue().toString(offset); + } + + @Ensures({"result != null"}) + public final String fullyQualifiedName() { + if ( isRoot() ) + return ""; + else if ( parent.isRoot() ) + return name; + else + return parent.fullyQualifiedName() + "." + name; + } + + @Ensures({"result != null"}) + public String toOneLineString() { + return getName() + "=" + getValue().toOneLineString(); + } + + @Ensures({"result != null"}) + public DiffNode getValueAsNode() { + if ( getValue().isCompound() ) + return (DiffNode)getValue(); + else + throw new ReviewedStingException("Illegal request conversion of a DiffValue into a DiffNode: " + this); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java new file mode 100644 index 000000000..ba2713bff --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:51 PM + * A generic engine for comparing tree-structured objects + */ +public class DiffEngine { + final protected static Logger logger = Logger.getLogger(DiffEngine.class); + + private final Map readers = new HashMap(); + + public DiffEngine() { + loadDiffableReaders(); + } + + // -------------------------------------------------------------------------------- + // + // difference calculation + // + // -------------------------------------------------------------------------------- + + public List diff(DiffElement master, DiffElement test) { + DiffValue masterValue = master.getValue(); + DiffValue testValue = test.getValue(); + + if ( masterValue.isCompound() && masterValue.isCompound() ) { + return diff(master.getValueAsNode(), test.getValueAsNode()); + } else if ( masterValue.isAtomic() && testValue.isAtomic() ) { + return diff(masterValue, testValue); + } else { + // structural difference in types. one is node, other is leaf + return Arrays.asList(new Difference(master, test)); + } + } + + public List diff(DiffNode master, DiffNode test) { + Set allNames = new HashSet(master.getElementNames()); + allNames.addAll(test.getElementNames()); + List diffs = new ArrayList(); + + for ( String name : allNames ) { + DiffElement masterElt = master.getElement(name); + DiffElement testElt = test.getElement(name); + if ( masterElt == null && testElt == null ) { + throw new ReviewedStingException("BUG: unexceptedly got two null elements for field: " + name); + } else if ( masterElt == null || testElt == null ) { // if either is null, we are missing a value + // todo -- should one of these be a special MISSING item? + diffs.add(new Difference(masterElt, testElt)); + } else { + diffs.addAll(diff(masterElt, testElt)); + } + } + + return diffs; + } + + public List diff(DiffValue master, DiffValue test) { + if ( master.getValue().equals(test.getValue()) ) { + return Collections.emptyList(); + } else { + return Arrays.asList(new Difference(master.getBinding(), test.getBinding())); + } + } + + // -------------------------------------------------------------------------------- + // + // Summarizing differences + // + // -------------------------------------------------------------------------------- + + /** + * Emits a summary of the diffs to out. Suppose you have the following three differences: + * + * A.X.Z:1!=2 + * A.Y.Z:3!=4 + * B.X.Z:5!=6 + * + * The above is the itemized list of the differences. The summary looks for common differences + * in the name hierarchy, counts those shared elements, and emits the differences that occur + * in order of decreasing counts. + * + * So, in the above example, what are the shared elements? + * + * A.X.Z and B.X.Z share X.Z, so there's a *.X.Z with count 2 + * A.X.Z, A.Y.Z, and B.X.Z all share *.*.Z, with count 3 + * Each of A.X.Z, A.Y.Z, and B.X.Z are individually unique, with count 1 + * + * So we would emit the following summary: + * + * *.*.Z: 3 + * *.X.Z: 2 + * A.X.Z: 1 [specific difference: 1!=2] + * A.Y.Z: 1 [specific difference: 3!=4] + * B.X.Z: 1 [specific difference: 5!=6] + * + * The algorithm to accomplish this calculation is relatively simple. Start with all of the + * concrete differences. For each pair of differences A1.A2....AN and B1.B2....BN: + * + * find the longest common subsequence Si.Si+1...SN where Ai = Bi = Si + * If i == 0, then there's no shared substructure + * If i > 0, then generate the summarized value X = *.*...Si.Si+1...SN + * if X is a known summary, increment it's count, otherwise set its count to 1 + * + * Not that only pairs of the same length are considered as potentially equivalent + * + * @param params determines how we display the items + * @param diffs + */ + public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { + printSummaryReport(summarizeDifferences(diffs), params ); + } + + public List summarizeDifferences(List diffs) { + List diffPaths = new ArrayList(diffs.size()); + + for ( Difference diff1 : diffs ) { + diffPaths.add(diffNameToPath(diff1.getFullyQualifiedName())); + } + + return summarizedDifferencesOfPaths(diffPaths); + } + + final protected static String[] diffNameToPath(String diffName) { + return diffName.split("\\."); + } + + protected List summarizedDifferencesOfPaths(List diffPaths) { + Map summaries = new HashMap(); + + // create the initial set of differences + for ( int i = 0; i < diffPaths.size(); i++ ) { + for ( int j = 0; j <= i; j++ ) { + String[] diffPath1 = diffPaths.get(i); + String[] diffPath2 = diffPaths.get(j); + if ( diffPath1.length == diffPath2.length ) { + int lcp = longestCommonPostfix(diffPath1, diffPath2); + String path = lcp > 0 ? summarizedPath(diffPath2, lcp) : Utils.join(".", diffPath2); + addSummary(summaries, path, true); + } + } + } + + // count differences + for ( String[] diffPath : diffPaths ) { + for ( SummarizedDifference sumDiff : summaries.values() ) { + if ( sumDiff.matches(diffPath) ) + addSummary(summaries, sumDiff.getPath(), false); + } + } + + List sortedSummaries = new ArrayList(summaries.values()); + Collections.sort(sortedSummaries); + return sortedSummaries; + } + + private static void addSummary(Map summaries, String path, boolean onlyCatalog) { + if ( summaries.containsKey(path) ) { + if ( ! onlyCatalog ) + summaries.get(path).incCount(); + } else { + SummarizedDifference sumDiff = new SummarizedDifference(path); + summaries.put(sumDiff.getPath(), sumDiff); + } + } + + protected void printSummaryReport(List sortedSummaries, SummaryReportParams params ) { + GATKReport report = new GATKReport(); + final String tableName = "diffences"; + report.addTable(tableName, "Summarized differences between the master and test files.\nSee http://www.broadinstitute.org/gsa/wiki/index.php/DiffObjectsWalker_and_SummarizedDifferences for more information"); + GATKReportTable table = report.getTable(tableName); + table.addPrimaryKey("Difference", true); + table.addColumn("NumberOfOccurrences", 0); + + int count = 0, count1 = 0; + for ( SummarizedDifference diff : sortedSummaries ) { + if ( diff.getCount() < params.minSumDiffToShow ) + // in order, so break as soon as the count is too low + break; + + if ( params.maxItemsToDisplay != 0 && count++ > params.maxItemsToDisplay ) + break; + + if ( diff.getCount() == 1 ) { + count1++; + if ( params.maxCountOneItems != 0 && count1 > params.maxCountOneItems ) + break; + } + + table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount()); + } + + table.write(params.out); + } + + protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) { + int i = 0; + for ( ; i < diffPath1.length; i++ ) { + int j = diffPath1.length - i - 1; + if ( ! diffPath1[j].equals(diffPath2[j]) ) + break; + } + return i; + } + + /** + * parts is [A B C D] + * commonPostfixLength: how many parts are shared at the end, suppose its 2 + * We want to create a string *.*.C.D + * + * @param parts + * @param commonPostfixLength + * @return + */ + protected static String summarizedPath(String[] parts, int commonPostfixLength) { + int stop = parts.length - commonPostfixLength; + if ( stop > 0 ) parts = parts.clone(); + for ( int i = 0; i < stop; i++ ) { + parts[i] = "*"; + } + return Utils.join(".", parts); + } + + /** + * TODO -- all of the algorithms above should use SummarizedDifference instead + * TODO -- of some SummarizedDifferences and some low-level String[] + */ + public static class SummarizedDifference implements Comparable { + final String path; // X.Y.Z + final String[] parts; + int count = 0; + + public SummarizedDifference(String path) { + this.path = path; + this.parts = diffNameToPath(path); + } + + public void incCount() { count++; } + + public int getCount() { + return count; + } + + /** + * The fully qualified path object A.B.C etc + * @return + */ + public String getPath() { + return path; + } + + /** + * @return the length of the parts of this summary + */ + public int length() { + return this.parts.length; + } + + /** + * Returns true if the string parts matches this summary. Matches are + * must be equal() everywhere where this summary isn't *. + * @param otherParts + * @return + */ + public boolean matches(String[] otherParts) { + if ( otherParts.length != length() ) + return false; + + // TODO optimization: can start at right most non-star element + for ( int i = 0; i < length(); i++ ) { + String part = parts[i]; + if ( ! part.equals("*") && ! part.equals(otherParts[i]) ) + return false; + } + + return true; + } + + @Override + public String toString() { + return String.format("%s:%d", getPath(), getCount()); + } + + @Override + public int compareTo(SummarizedDifference other) { + // sort first highest to lowest count, then by lowest to highest path + int countCmp = Integer.valueOf(count).compareTo(other.count); + return countCmp != 0 ? -1 * countCmp : path.compareTo(other.path); + } + + + } + + // -------------------------------------------------------------------------------- + // + // plugin manager + // + // -------------------------------------------------------------------------------- + + public void loadDiffableReaders() { + List> drClasses = new PluginManager( DiffableReader.class ).getPlugins(); + + logger.info("Loading diffable modules:"); + for (Class drClass : drClasses ) { + logger.info("\t" + drClass.getSimpleName()); + + try { + DiffableReader dr = drClass.newInstance(); + readers.put(dr.getName(), dr); + } catch (InstantiationException e) { + throw new ReviewedStingException("Unable to instantiate module '" + drClass.getSimpleName() + "'"); + } catch (IllegalAccessException e) { + throw new ReviewedStingException("Illegal access error when trying to instantiate '" + drClass.getSimpleName() + "'"); + } + } + } + + protected Map getReaders() { + return readers; + } + + protected DiffableReader getReader(String name) { + return readers.get(name); + } + + /** + * Returns a reader appropriate for this file, or null if no such reader exists + * @param file + * @return + */ + public DiffableReader findReaderForFile(File file) { + for ( DiffableReader reader : readers.values() ) + if (reader.canRead(file) ) + return reader; + + return null; + } + + /** + * Returns true if reader appropriate for this file, or false if no such reader exists + * @param file + * @return + */ + public boolean canRead(File file) { + return findReaderForFile(file) != null; + } + + public DiffElement createDiffableFromFile(File file) { + DiffableReader reader = findReaderForFile(file); + if ( reader == null ) + throw new UserException("Unsupported file type: " + file); + else + return reader.readFromFile(file); + } + + public static boolean simpleDiffFiles(File masterFile, File testFile, DiffEngine.SummaryReportParams params) { + DiffEngine diffEngine = new DiffEngine(); + + if ( diffEngine.canRead(masterFile) && diffEngine.canRead(testFile) ) { + DiffElement master = diffEngine.createDiffableFromFile(masterFile); + DiffElement test = diffEngine.createDiffableFromFile(testFile); + List diffs = diffEngine.diff(master, test); + diffEngine.reportSummarizedDifferences(diffs, params); + return true; + } else { + return false; + } + } + + public static class SummaryReportParams { + PrintStream out = System.out; + int maxItemsToDisplay = 0; + int maxCountOneItems = 0; + int minSumDiffToShow = 0; + + public SummaryReportParams(PrintStream out, int maxItemsToDisplay, int maxCountOneItems, int minSumDiffToShow) { + this.out = out; + this.maxItemsToDisplay = maxItemsToDisplay; + this.maxCountOneItems = maxCountOneItems; + this.minSumDiffToShow = minSumDiffToShow; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java new file mode 100644 index 000000000..0720e18c0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:55 PM + * + * An interface that must be implemented to allow us to calculate differences + * between structured objects + */ +public class DiffNode extends DiffValue { + private Map getElementMap() { + return (Map)super.getValue(); + } + private static Map emptyElements() { return new HashMap(); } + + private DiffNode(Map elements) { + super(elements); + } + + private DiffNode(DiffElement binding, Map elements) { + super(binding, elements); + } + + // --------------------------------------------------------------------------- + // + // constructors + // + // --------------------------------------------------------------------------- + + public static DiffNode rooted(String name) { + return empty(name, DiffElement.ROOT); + } + + public static DiffNode empty(String name, DiffElement parent) { + DiffNode df = new DiffNode(emptyElements()); + DiffElement elt = new DiffElement(name, parent, df); + df.setBinding(elt); + return df; + } + + public static DiffNode empty(String name, DiffValue parent) { + return empty(name, parent.getBinding()); + } + + // --------------------------------------------------------------------------- + // + // accessors + // + // --------------------------------------------------------------------------- + + @Override + public boolean isAtomic() { return false; } + + public Collection getElementNames() { + return getElementMap().keySet(); + } + + public Collection getElements() { + return getElementMap().values(); + } + + private Collection getElements(boolean atomicOnly) { + List elts = new ArrayList(); + for ( DiffElement elt : getElements() ) + if ( (atomicOnly && elt.getValue().isAtomic()) || (! atomicOnly && elt.getValue().isCompound())) + elts.add(elt); + return elts; + } + + public Collection getAtomicElements() { + return getElements(true); + } + + public Collection getCompoundElements() { + return getElements(false); + } + + public DiffElement getElement(String name) { + for ( DiffElement elt : getElements() ) + if ( elt.getName().equals(name) ) + return elt; + return null; + } + + /** + * Returns true if name is bound in this node + * @param name + * @return + */ + public boolean hasElement(String name) { + return getElement(name) != null; + } + + // --------------------------------------------------------------------------- + // + // add + // + // --------------------------------------------------------------------------- + + @Requires("elt != null") + public void add(DiffElement elt) { + if ( getElementMap().containsKey(elt.getName()) ) + throw new IllegalArgumentException("Attempting to rebind already existing binding: " + elt + " node=" + this); + getElementMap().put(elt.getName(), elt); + } + + @Requires("elt != null") + public void add(DiffValue elt) { + add(elt.getBinding()); + } + + @Requires("elts != null") + public void add(Collection elts) { + for ( DiffElement e : elts ) + add(e); + } + + public void add(String name, Object value) { + add(new DiffElement(name, this.getBinding(), new DiffValue(value))); + } + + // --------------------------------------------------------------------------- + // + // toString + // + // --------------------------------------------------------------------------- + + @Override + public String toString() { + return toString(0); + } + + @Override + public String toString(int offset) { + String off = offset > 0 ? Utils.dupString(' ', offset) : ""; + StringBuilder b = new StringBuilder(); + + b.append("(").append("\n"); + Collection atomicElts = getAtomicElements(); + for ( DiffElement elt : atomicElts ) { + b.append(elt.toString(offset + 2)).append('\n'); + } + + for ( DiffElement elt : getCompoundElements() ) { + b.append(elt.toString(offset + 4)).append('\n'); + } + b.append(off).append(")").append("\n"); + + return b.toString(); + } + + @Override + public String toOneLineString() { + StringBuilder b = new StringBuilder(); + + b.append('('); + List parts = new ArrayList(); + for ( DiffElement elt : getElements() ) + parts.add(elt.toOneLineString()); + b.append(Utils.join(" ", parts)); + b.append(')'); + + return b.toString(); + } + + // -------------------------------------------------------------------------------- + // + // fromString and toOneLineString + // + // -------------------------------------------------------------------------------- + + public static DiffElement fromString(String tree) { + return fromString(tree, DiffElement.ROOT); + } + + /** + * Doesn't support full tree structure parsing + * @param tree + * @param parent + * @return + */ + private static DiffElement fromString(String tree, DiffElement parent) { + // X=(A=A B=B C=(D=D)) + String[] parts = tree.split("=", 2); + if ( parts.length != 2 ) + throw new ReviewedStingException("Unexpected tree structure: " + tree + " parts=" + parts); + String name = parts[0]; + String value = parts[1]; + + if ( value.length() == 0 ) + throw new ReviewedStingException("Illegal tree structure: " + value + " at " + tree); + + if ( value.charAt(0) == '(' ) { + if ( ! value.endsWith(")") ) + throw new ReviewedStingException("Illegal tree structure. Missing ): " + value + " at " + tree); + String subtree = value.substring(1, value.length()-1); + DiffNode rec = DiffNode.empty(name, parent); + String[] subParts = subtree.split(" "); + for ( String subPart : subParts ) { + rec.add(fromString(subPart, rec.getBinding())); + } + return rec.getBinding(); + } else { + return new DiffValue(name, parent, value).getBinding(); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java new file mode 100644 index 000000000..a08108db2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import org.apache.xmlbeans.impl.tool.Diff; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; + +import java.io.File; +import java.io.PrintStream; +import java.util.List; + +/** + * Compares two record-oriented files, itemizing specific difference between equivalent + * records in the two files. Reports both itemized and summarized differences. + * @author Mark DePristo + * @version 0.1 + */ +@Requires(value={}) +public class DiffObjectsWalker extends RodWalker { + @Output(doc="File to which results should be written",required=true) + protected PrintStream out; + + @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false) + int MAX_RECORDS = 0; + + @Argument(fullName="maxCount1Records", shortName="M1", doc="Max. number of records occuring exactly once in the file to process", required=false) + int MAX_COUNT1_RECORDS = 0; + + @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) + int minCountForDiff = 1; + + @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) + boolean showItemizedDifferences = false; + + @Argument(fullName="master", shortName="m", doc="Master file: expected results", required=true) + File masterFile; + + @Argument(fullName="test", shortName="t", doc="Test file: new results to compare to the master file", required=true) + File testFile; + + final DiffEngine diffEngine = new DiffEngine(); + + @Override + public void initialize() { + + } + + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } + + @Override + public void onTraversalDone(Integer sum) { + out.printf("Reading master file %s%n", masterFile); + DiffElement master = diffEngine.createDiffableFromFile(masterFile); + out.printf("Reading test file %s%n", testFile); + DiffElement test = diffEngine.createDiffableFromFile(testFile); + +// out.printf("Master diff objects%n"); +// out.println(master.toString()); +// out.printf("Test diff objects%n"); +// out.println(test.toString()); + + List diffs = diffEngine.diff(master, test); + if ( showItemizedDifferences ) { + out.printf("Itemized results%n"); + for ( Difference diff : diffs ) + out.printf("DIFF: %s%n", diff.toString()); + } + + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_RECORDS, MAX_COUNT1_RECORDS, minCountForDiff); + diffEngine.reportSummarizedDifferences(diffs, params); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java new file mode 100644 index 000000000..7245e9e8d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import org.broadinstitute.sting.utils.Utils; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:55 PM + * + * An interface that must be implemented to allow us to calculate differences + * between structured objects + */ +public class DiffValue { + private DiffElement binding = null; + final private Object value; + + public DiffValue(Object value) { + this.value = value; + } + + public DiffValue(DiffElement binding, Object value) { + this.binding = binding; + this.value = value; + } + + public DiffValue(DiffValue parent, Object value) { + this(parent.getBinding(), value); + } + + public DiffValue(String name, DiffElement parent, Object value) { + this.binding = new DiffElement(name, parent, this); + this.value = value; + } + + public DiffValue(String name, DiffValue parent, Object value) { + this(name, parent.getBinding(), value); + } + + public DiffElement getBinding() { + return binding; + } + + protected void setBinding(DiffElement binding) { + this.binding = binding; + } + + public Object getValue() { + return value; + } + + public String toString() { + return getValue().toString(); + } + + public String toString(int offset) { + return toString(); + } + + public String toOneLineString() { + return getValue().toString(); + } + + public boolean isAtomic() { return true; } + public boolean isCompound() { return ! isAtomic(); } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java new file mode 100644 index 000000000..84c2eed10 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.io.File; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 1:09 PM + * + * Interface for readers creating diffable objects from a file + */ +public interface DiffableReader { + @Ensures("result != null") + public String getName(); + + @Ensures("result != null") + @Requires("file != null") + public DiffElement readFromFile(File file); + + @Requires("file != null") + public boolean canRead(File file); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java new file mode 100644 index 000000000..6627a4cc5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:53 PM + * + * Represents a specific difference between two specific DiffElements + */ +public class Difference { + DiffElement master, test; + + public Difference(DiffElement master, DiffElement test) { + if ( master == null && test == null ) throw new IllegalArgumentException("Master and test both cannot be null"); + this.master = master; + this.test = test; + } + + public String toString() { + return String.format("%s:%s!=%s", + getFullyQualifiedName(), + getOneLineString(master), + getOneLineString(test)); + } + + public String getFullyQualifiedName() { + return (master == null ? test : master).fullyQualifiedName(); + } + + private static String getOneLineString(DiffElement elt) { + return elt == null ? "MISSING" : elt.getValue().toOneLineString(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java new file mode 100644 index 000000000..743178538 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import org.broad.tribble.readers.AsciiLineReader; +import org.broad.tribble.readers.LineReader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; +import java.util.Arrays; +import java.util.Map; +import java.util.zip.GZIPInputStream; + + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 1:09 PM + * + * Class implementing diffnode reader for VCF + */ +public class VCFDiffableReader implements DiffableReader { + @Override + public String getName() { return "VCF"; } + + @Override + public DiffElement readFromFile(File file) { + DiffNode root = DiffNode.rooted(file.getName()); + try { + LineReader lineReader = new AsciiLineReader(new FileInputStream(file)); + VCFCodec vcfCodec = new VCFCodec(); + VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader); + + String line = lineReader.readLine(); + while ( line != null ) { + VariantContext vc = (VariantContext)vcfCodec.decode(line); + String name = vc.getChr() + ":" + vc.getStart(); + DiffNode vcRoot = DiffNode.empty(name, root); + + // add fields + vcRoot.add("CHROM", vc.getChr()); + vcRoot.add("POS", vc.getStart()); + vcRoot.add("ID", vc.hasID() ? vc.getID() : VCFConstants.MISSING_VALUE_v4); + vcRoot.add("REF", vc.getReference()); + vcRoot.add("ALT", vc.getAlternateAlleles()); + vcRoot.add("QUAL", vc.hasNegLog10PError() ? vc.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4); + vcRoot.add("FILTER", vc.getFilters()); + + // add info fields + for (Map.Entry attribute : vc.getAttributes().entrySet()) { + if ( ! attribute.getKey().startsWith("_") && ! attribute.getKey().equals(VariantContext.ID_KEY)) + vcRoot.add(attribute.getKey(), attribute.getValue()); + } + + for (Genotype g : vc.getGenotypes().values() ) { + DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot); + gRoot.add("GT", g.getGenotypeString()); + gRoot.add("GQ", g.hasNegLog10PError() ? g.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4 ); + + for (Map.Entry attribute : g.getAttributes().entrySet()) { + if ( ! attribute.getKey().startsWith("_") ) + gRoot.add(attribute.getKey(), attribute.getValue()); + } + + vcRoot.add(gRoot); + } + + root.add(vcRoot); + line = lineReader.readLine(); + } + + lineReader.close(); + } catch ( IOException e ) { + return null; + } + + return root.getBinding(); + } + + @Override + public boolean canRead(File file) { + try { + final String VCF4_HEADER = "##fileformat=VCFv4"; + char[] buff = new char[VCF4_HEADER.length()]; + new FileReader(file).read(buff, 0, VCF4_HEADER.length()); + String firstLine = new String(buff); + return firstLine.startsWith(VCF4_HEADER); + } catch ( IOException e ) { + return false; + } + } +} From d7d15019dd543decffa2169f074b123633fe5984 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 12 Jul 2011 01:16:21 -0400 Subject: [PATCH 35/38] Adding support for other simple header line types (e.g. ALT) and cleaning up the interface a bit. --- .../walkers/annotator/ChromosomeCounts.java | 5 +- .../walkers/genotyper/UnifiedGenotyper.java | 17 +++- .../utils/codecs/vcf/StandardVCFWriter.java | 3 +- .../utils/codecs/vcf/VCFAltHeaderLine.java | 28 +++++++ .../codecs/vcf/VCFCompoundHeaderLine.java | 3 + .../utils/codecs/vcf/VCFFilterHeaderLine.java | 48 +---------- .../utils/codecs/vcf/VCFFormatHeaderLine.java | 2 +- .../utils/codecs/vcf/VCFSimpleHeaderLine.java | 81 +++++++++++++++++++ .../sting/utils/codecs/vcf/VCFUtils.java | 15 ---- 9 files changed, 135 insertions(+), 67 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 143722d7c..ed10d2072 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -41,8 +42,8 @@ import java.util.*; public class ChromosomeCounts implements InfoFieldAnnotation, StandardAnnotation { private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; - private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), - new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"), + private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), + new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"), new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes") }; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 7a765c602..fe0084a19 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -37,7 +37,6 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import java.util.*; import java.io.PrintStream; @@ -158,7 +157,7 @@ public class UnifiedGenotyper extends LocusWalker getSupportedHeaderStrings() { + Set result = new HashSet(); + result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); + result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); + result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; if site is not biallelic, number of likelihoods if n*(n+1)/2")); + + return result; + } + /** * Compute at a given locus. * diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 230773310..f4996b487 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -358,9 +358,8 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write(key); if ( !entry.getValue().equals("") ) { - int numVals = 1; VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key); - if ( metaData != null && (metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() > 0) ) { + if ( metaData == null || metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() != 0 ) { mWriter.write("="); mWriter.write(entry.getValue()); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java new file mode 100644 index 000000000..a9de949d8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.utils.codecs.vcf; + +/** + * @author ebanks + * A class representing a key=value entry for ALT fields in the VCF header + */ +public class VCFAltHeaderLine extends VCFSimpleHeaderLine { + + /** + * create a VCF filter header line + * + * @param name the name for this header line + * @param description the description for this header line + */ + public VCFAltHeaderLine(String name, String description) { + super(name, description, SupportedHeaderLineType.ALT); + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the vcf header version + */ + protected VCFAltHeaderLine(String line, VCFHeaderVersion version) { + super(line, version, SupportedHeaderLineType.ALT); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 49f9ab184..bb822f2ed 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -89,6 +89,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF * @param count the count for this header line * @param type the type for this header line * @param description the description for this header line + * @param lineType the header line type */ protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { super(lineType.toString(), ""); @@ -108,6 +109,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF * @param count the count type for this header line * @param type the type for this header line * @param description the description for this header line + * @param lineType the header line type */ protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { super(lineType.toString(), ""); @@ -124,6 +126,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF * * @param line the header line * @param version the VCF header version + * @param lineType the header line type * */ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java index 9176fc16e..418b80074 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java @@ -1,19 +1,10 @@ package org.broadinstitute.sting.utils.codecs.vcf; -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.Map; - - /** * @author ebanks * A class representing a key=value entry for FILTER fields in the VCF header */ -public class VCFFilterHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { - - private String name; - private String description; - +public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { /** * create a VCF filter header line @@ -22,12 +13,7 @@ public class VCFFilterHeaderLine extends VCFHeaderLine implements VCFNamedHeader * @param description the description for this header line */ public VCFFilterHeaderLine(String name, String description) { - super("FILTER", ""); - this.name = name; - this.description = description; - - if ( name == null || description == null ) - throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description )); + super(name, description, SupportedHeaderLineType.FILTER); } /** @@ -37,34 +23,6 @@ public class VCFFilterHeaderLine extends VCFHeaderLine implements VCFNamedHeader * @param version the vcf header version */ protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) { - super("FILTER", ""); - Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description")); - name = mapping.get("ID"); - description = mapping.get("Description"); - if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; - } - - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - map.put("Description", description); - return "FILTER=" + VCFHeaderLine.toStringEncoding(map); - } - - public boolean equals(Object o) { - if ( !(o instanceof VCFFilterHeaderLine) ) - return false; - VCFFilterHeaderLine other = (VCFFilterHeaderLine)o; - return name.equals(other.name) && - description.equals(other.description); - } - - public String getName() { - return name; - } - - public String getDescription() { - return description; + super(line, version, SupportedHeaderLineType.FILTER); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java index f68cb670b..474c8dd14 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java @@ -17,7 +17,7 @@ public class VCFFormatHeaderLine extends VCFCompoundHeaderLine { } public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); + super(name, count, type, description, SupportedHeaderLineType.FORMAT); } protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java new file mode 100644 index 000000000..152043f28 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java @@ -0,0 +1,81 @@ +package org.broadinstitute.sting.utils.codecs.vcf; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.Map; + + +/** + * @author ebanks + * A class representing a key=value entry for simple VCF header types + */ +public abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { + + public enum SupportedHeaderLineType { + FILTER, ALT; + } + + private String name; + private String description; + + // our type of line, i.e. filter, alt, etc + private final SupportedHeaderLineType lineType; + + + /** + * create a VCF filter header line + * + * @param name the name for this header line + * @param description the description for this header line + * @param lineType the header line type + */ + public VCFSimpleHeaderLine(String name, String description, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.lineType = lineType; + this.name = name; + this.description = description; + + if ( name == null || description == null ) + throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description )); + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the vcf header version + * @param lineType the header line type + */ + protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.lineType = lineType; + Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description")); + name = mapping.get("ID"); + description = mapping.get("Description"); + if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided + description = UNBOUND_DESCRIPTION; + } + + protected String toStringEncoding() { + Map map = new LinkedHashMap(); + map.put("ID", name); + map.put("Description", description); + return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); + } + + public boolean equals(Object o) { + if ( !(o instanceof VCFSimpleHeaderLine) ) + return false; + VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o; + return name.equals(other.name) && + description.equals(other.description); + } + + public String getName() { + return name; + } + + public String getDescription() { + return description; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index ecede068e..4037f75b9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -180,19 +180,4 @@ public class VCFUtils { return new HashSet(map.values()); } - - /** - * return a set of supported format lines; what we currently support for output in the genotype fields of a VCF - * @return a set of VCF format lines - */ - public static Set getSupportedHeaderStrings() { - Set result = new HashSet(); - result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); - result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, -1, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; if site is not biallelic, number of likelihoods if n*(n+1)/2")); - - return result; - } - } \ No newline at end of file From cfe43e3971327ff26ef9087e31b4294d4a98d99c Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 12 Jul 2011 13:43:46 -0400 Subject: [PATCH 36/38] Bug fix for Genotype given alleles: if we are in INDEL mode ignore SNPs and MNPs instead of emitting an empty site with alleles but no annotations --- .../genotyper/UnifiedGenotyperEngine.java | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 4c9080884..6fc972b5d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -634,17 +634,27 @@ public class UnifiedGenotyperEngine { if (vcInput == null) return null; - if (vcInput.isSNP() && ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP)) - return GenotypeLikelihoodsCalculationModel.Model.SNP; + // todo - no support to genotype MNP's yet + if (vcInput.isMNP()) + return null; + + if (vcInput.isSNP()) { + if (( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP)) + return GenotypeLikelihoodsCalculationModel.Model.SNP; + else + // ignore SNP's if user chose INDEL mode + return null; + } else if ((vcInput.isIndel() || vcInput.isMixed()) && (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL)) return GenotypeLikelihoodsCalculationModel.Model.INDEL; - } else { + } + else { // todo - this assumes SNP's take priority when BOTH is selected, should do a smarter way once extended events are removed if( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP) return GenotypeLikelihoodsCalculationModel.Model.SNP; else if (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL) return GenotypeLikelihoodsCalculationModel.Model.INDEL; - } + } } return null; } From 73735863b0fbff0e7dc5ee789f2e075ead13f7fa Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 12 Jul 2011 13:55:21 -0400 Subject: [PATCH 37/38] Fix for the case of requesting genotype for a sample that doesn't exist in a VariantContext --- .../sting/utils/variantcontext/VariantContext.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 5787b591f..da80a3431 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -867,7 +867,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati for ( String name : sampleNames ) { if ( map.containsKey(name) ) throw new IllegalArgumentException("Duplicate names detected in requested samples " + sampleNames); - map.put(name, getGenotype(name)); + final Genotype g = getGenotype(name); + if ( g != null ) { + map.put(name, g); + } } return map; From a2597e7f00824b37174a648da7c648938f5c4886 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 12 Jul 2011 14:11:53 -0400 Subject: [PATCH 38/38] This commit incorporates several different changes that each pretty much break all the VCF-based integration tests, so I bunched them all together. We now officially emit VCF4.1 files (woo hoo), which means that the VCF headers are now all different (header version is 4.1 plus counts for some of the annotations are 'A' or 'G'). Also, I've added a Read Filter for reads with MQ=255 ('unavailable' in the SAM spec) and have applied this to the UG and the RMS MQ annotation. --- .../MappingQualityUnavailableReadFilter.java | 43 +++++++++++++++++ ...java => MappingQualityZeroReadFilter.java} | 5 +- .../annotator/AlleleBalanceBySample.java | 2 +- .../walkers/annotator/ChromosomeCounts.java | 4 +- .../annotator/MappingQualityRankSumTest.java | 7 ++- .../gatk/walkers/annotator/NBaseCount.java | 2 +- .../walkers/annotator/RMSMappingQuality.java | 7 ++- .../gatk/walkers/annotator/RankSumTest.java | 5 +- .../walkers/genotyper/UnifiedGenotyper.java | 5 +- .../indels/RealignerTargetCreator.java | 4 +- .../indels/SomaticIndelDetectorWalker.java | 2 +- .../phasing/ReadBackedPhasingWalker.java | 4 +- .../recalibration/CountCovariatesWalker.java | 4 +- .../sting/utils/QualityUtils.java | 4 ++ .../utils/codecs/vcf/StandardVCFWriter.java | 6 +-- .../VariantAnnotatorIntegrationTest.java | 28 +++++------ .../GenomicAnnotatorIntegrationTest.java | 6 +-- .../walkers/beagle/BeagleIntegrationTest.java | 6 +-- .../VariantFiltrationIntegrationTest.java | 22 ++++----- .../UnifiedGenotyperIntegrationTest.java | 48 +++++++++---------- .../ReadBackedPhasingIntegrationTest.java | 12 ++--- ...ntRecalibrationWalkersIntegrationTest.java | 2 +- .../CombineVariantsIntegrationTest.java | 34 ++++++------- .../LiftoverVariantsIntegrationTest.java | 6 +-- .../SelectVariantsIntegrationTest.java | 8 ++-- .../VCFStreamingIntegrationTest.java | 2 +- .../VariantsToVCFIntegrationTest.java | 8 ++-- .../VariantContextIntegrationTest.java | 2 +- 28 files changed, 169 insertions(+), 119 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java rename public/java/src/org/broadinstitute/sting/gatk/filters/{ZeroMappingQualityReadFilter.java => MappingQualityZeroReadFilter.java} (90%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java new file mode 100644 index 000000000..cecbedda8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.util.QualityUtil; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.QualityUtils; + +/** + * Filter out mapping quality zero reads. + * + * @author ebanks + * @version 0.1 + */ + +public class MappingQualityUnavailableReadFilter extends ReadFilter { + public boolean filterOut(SAMRecord rec) { + return (rec.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE); + } +} + diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ZeroMappingQualityReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java similarity index 90% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ZeroMappingQualityReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java index 7e6fc5e82..e49d4117c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/ZeroMappingQualityReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java @@ -24,17 +24,16 @@ package org.broadinstitute.sting.gatk.filters; -import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMRecord; /** - * Filter out zero mapping quality reads. + * Filter out mapping quality zero reads. * * @author hanna * @version 0.1 */ -public class ZeroMappingQualityReadFilter extends ReadFilter { +public class MappingQualityZeroReadFilter extends ReadFilter { public boolean filterOut(SAMRecord rec) { return (rec.getMappingQuality() == 0); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 0be737897..51d290763 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -62,5 +62,5 @@ public class AlleleBalanceBySample implements GenotypeAnnotation, ExperimentalAn public List getKeyNames() { return Arrays.asList("AB"); } - public List getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), -1, VCFHeaderLineType.Float, "Allele balance for each het genotype")); } + public List getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Allele balance for each het genotype")); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index ed10d2072..f3ec2b1df 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -42,8 +42,8 @@ import java.util.*; public class ChromosomeCounts implements InfoFieldAnnotation, StandardAnnotation { private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; - private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), - new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"), + private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), + new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"), new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes") }; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 11f86b972..8260a5a81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -21,7 +22,7 @@ public class MappingQualityRankSumTest extends RankSumTest { protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { for ( final PileupElement p : pileup ) { - if( isUsableBase(p) && p.getMappingQual() < 254 ) { // 254 and 255 are special mapping qualities used as a code by aligners + if ( isUsableBase(p) ) { if ( p.getBase() == ref ) { refQuals.add((double)p.getMappingQual()); } else if ( p.getBase() == alt ) { @@ -34,7 +35,7 @@ public class MappingQualityRankSumTest extends RankSumTest { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() < 254) { + if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() != 0 && p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) { // retrieve likelihood information corresponding to this read LinkedHashMap el = indelLikelihoodMap.get(p); // by design, first element in LinkedHashMap was ref allele @@ -54,8 +55,6 @@ public class MappingQualityRankSumTest extends RankSumTest { refQuals.add((double)p.getMappingQual()); else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) altQuals.add((double)p.getMappingQual()); - - } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java index ba3e2cc8b..3b64abfff 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java @@ -47,5 +47,5 @@ public class NBaseCount implements InfoFieldAnnotation { public List getKeyNames() { return Arrays.asList("PercentNBaseSolid"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("PercentNBaseSolid", 4, VCFHeaderLineType.Float, "Percentage of N bases in the pileup (counting only SOLiD reads)")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("PercentNBaseSolid", 1, VCFHeaderLineType.Float, "Percentage of N bases in the pileup (counting only SOLiD reads)")); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 6e80c7555..1ef7ccd0b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -38,8 +39,10 @@ public class RMSMappingQuality implements InfoFieldAnnotation, StandardAnnotatio pileup = context.getBasePileup(); if (pileup != null) { - for (PileupElement p : pileup ) - qualities[index++] = p.getRead().getMappingQuality(); + for (PileupElement p : pileup ) { + if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) + qualities[index++] = p.getMappingQual(); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 1a967293f..f00abd6a1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -106,6 +106,9 @@ public abstract class RankSumTest implements InfoFieldAnnotation, StandardAnnota protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals); protected static boolean isUsableBase( final PileupElement p ) { - return !( p.isDeletion() || p.getMappingQual() == 0 || ((int)p.getQual()) < 6 ); // need the unBAQed quality score here + return !( p.isDeletion() || + p.getMappingQual() == 0 || + p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || + ((int)p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE ); // need the unBAQed quality score here } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index fe0084a19..fc8a5819a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.gatk.contexts.*; import org.broadinstitute.sting.gatk.filters.BadMateFilter; @@ -47,7 +48,7 @@ import java.io.PrintStream; * multi-sample data. The user can choose from several different incorporated calculation models. */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) -@ReadFilters( {BadMateFilter.class} ) +@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableReadFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) @@ -175,7 +176,7 @@ public class UnifiedGenotyper extends LocusWalker { // @Output // PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index e59b29502..4833a6cad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -32,7 +32,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.*; @@ -58,7 +58,7 @@ import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFr @Requires(value = {DataSource.READS, DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) @By(DataSource.READS) -@ReadFilters({ZeroMappingQualityReadFilter.class}) +@ReadFilters({MappingQualityZeroReadFilter.class}) // Filter out all reads with zero mapping quality public class ReadBackedPhasingWalker extends RodWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index ee504b6e7..6673bec92 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.*; @@ -75,7 +75,7 @@ import java.util.Map; @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file -@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality +@ReadFilters( {MappingQualityZeroReadFilter.class} ) // Filter out all reads with zero mapping quality @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta @PartitionBy(PartitionType.LOCUS) public class CountCovariatesWalker extends LocusWalker implements TreeReducible { diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 23054e95f..fad2320fc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,9 +9,13 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { + public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double MIN_REASONABLE_ERROR = 0.0001; public final static byte MAX_REASONABLE_Q_SCORE = 40; + public final static byte MIN_USABLE_Q_SCORE = 6; + + public final static int MAPPING_QUALITY_UNAVAILABLE = 255; /** * Private constructor. No instantiating this class! diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index f4996b487..a8bf74707 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -123,12 +123,10 @@ public class StandardVCFWriter implements VCFWriter { try { // the file format field needs to be written first - mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_0.getFormatString() + "=" + VCFHeaderVersion.VCF4_0.getVersionString() + "\n"); + mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_1.getFormatString() + "=" + VCFHeaderVersion.VCF4_1.getVersionString() + "\n"); for ( VCFHeaderLine line : mHeader.getMetaData() ) { - if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) || - line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) || - line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) + if ( VCFHeaderVersion.isFormatString(line.getKey()) ) continue; // are the records filtered (so we know what to put in the FILTER column of passing records) ? diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 6ba6926c6..e6300e6c9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -15,7 +15,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("4cc077eb3d343e6b7ba12bff86ebe347")); + Arrays.asList("8a105fa5eebdfffe7326bc5b3d8ffd1c")); executeTest("test file has annotations, not asking for annotations, #1", spec); } @@ -23,7 +23,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsNotAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("1de8e943fbf55246ebd19efa32f22a58")); + Arrays.asList("964f1016ec9a3c55333f62dd834c14d6")); executeTest("test file has annotations, not asking for annotations, #2", spec); } @@ -31,7 +31,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("93c110e45fd4aedb044a8a5501e23336")); + Arrays.asList("8e7de435105499cd71ffc099e268a83e")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -39,7 +39,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("f5cb45910ed719f46159f9f71acaecf4")); + Arrays.asList("64b6804cb1e27826e3a47089349be581")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -47,7 +47,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("4b48e7d095ef73e3151542ea976ecd89")); + Arrays.asList("42ccee09fa9f8c58f4a0d4f1139c094f")); executeTest("test file doesn't have annotations, not asking for annotations, #1", spec); } @@ -55,7 +55,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsNotAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("28dfbfd178aca071b948cd3dc2365357")); + Arrays.asList("f2ddfa8105c290b1f34b7a261a02a1ac")); executeTest("test file doesn't have annotations, not asking for annotations, #2", spec); } @@ -63,7 +63,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("a330a5bc3ee72a51dbeb7e6c97a0db99")); + Arrays.asList("fd1ffb669800c2e07df1e2719aa38e49")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -71,7 +71,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("3a31d1ef471acfb881a2dec7963fe3f4")); + Arrays.asList("09f8e840770a9411ff77508e0ed0837f")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -79,7 +79,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("a63fd8ff7bafbd46b7f009144a7c2ad1")); + Arrays.asList("78d2c19f8107d865970dbaf3e12edd92")); executeTest("test overwriting header", spec); } @@ -87,7 +87,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, - Arrays.asList("36378f1245bb99d902fbfe147605bc42")); + Arrays.asList("16e3a1403fc376320d7c69492cad9345")); executeTest("not passing it any reads", spec); } @@ -95,7 +95,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -D " + GATKDataLocation + "dbsnp_129_b36.rod -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, - Arrays.asList("0257a1cc3c703535b2d3c5046bf88ab7")); + Arrays.asList("3da8ca2b6bdaf6e92d94a8c77a71313d")); executeTest("getting DB tag with dbSNP", spec); } @@ -103,7 +103,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B:compH3,VCF " + validationDataLocation + "fakeHM3.vcf -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, - Arrays.asList("2d7c73489dcf0db433bebdf79a068764")); + Arrays.asList("1bc01c5b3bd0b7aef75230310c3ce688")); executeTest("getting DB tag with HM3", spec); } @@ -111,13 +111,13 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B:foo,VCF " + validationDataLocation + "targetAnnotations.vcf -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -BTI variant", 1, - Arrays.asList("2f6efd08d818faa1eb0631844437c64a")); + Arrays.asList("e9c0d832dc6b4ed06c955060f830c140")); executeTest("using expression", spec); } @Test public void testTabixAnnotations() { - final String MD5 = "6c7a6a1c0027bf82656542a9b2671a35"; + final String MD5 = "13269d5a2e16f06fd755cc0fb9271acf"; for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -A HomopolymerRun -B:variant,VCF " + validationDataLocation + "/" + file + " -BTI variant -NO_HEADER", 1, diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java index c4f6d5ebc..c75a5b2dc 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java @@ -29,7 +29,7 @@ public class GenomicAnnotatorIntegrationTest extends WalkerTest { */ - String[] md5WithDashSArg = {"3d3b61a83c1189108eabb2df04218099"}; + String[] md5WithDashSArg = {"efba4ce1641cfa2ef88a64395f2ebce8"}; WalkerTestSpec specWithSArg = new WalkerTestSpec( "-T GenomicAnnotator -R " + b36KGReference + " -B:variant,vcf3 /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" + @@ -58,7 +58,7 @@ public class GenomicAnnotatorIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("caa562160733aa638e1ba413ede209ae") + Arrays.asList("772fc3f43b70770ec6c6acbb8bbbd4c0") ); executeTest("testGenomicAnnotatorOnIndels", testOnIndels); } @@ -76,7 +76,7 @@ public class GenomicAnnotatorIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("a4cf76f08fa90284b6988a464b6e0c17") + Arrays.asList("081ade7f3d2d3c5f19cb1e8651a626f3") ); executeTest("testGenomicAnnotatorOnSNPsAndIndels", testOnSNPsAndIndels); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 70c34e729..fef1b6e64 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest { "-B:beagleR2,BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "-B:beagleProbs,BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "-B:beaglePhased,BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s -NO_HEADER", 1, Arrays.asList("6bccee48ad2f06ba5a8c774fed444478")); + "-o %s -NO_HEADER", 1, Arrays.asList("3531451e84208264104040993889aaf4")); executeTest("test BeagleOutputToVCF", spec); } @@ -60,7 +60,7 @@ public class BeagleIntegrationTest extends WalkerTest { "-T ProduceBeagleInput -B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+ "-B:validation,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+ "-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta -NO_HEADER ",2, - Arrays.asList("660986891b30cdc937e0f2a3a5743faa","223fb977e8db567dcaf632c6ee51f294")); + Arrays.asList("660986891b30cdc937e0f2a3a5743faa","e96ddd51da9f4a797b2aa8c20e404166")); executeTest("test BeagleInputWithBootstrap",spec); } @@ -72,7 +72,7 @@ public class BeagleIntegrationTest extends WalkerTest { "-B:beagleR2,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "-B:beagleProbs,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "-B:beaglePhased,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("24b88ef8cdf6e347daab491f0256be5a")); + "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("8dd6ec53994fb46c5c22af8535d22965")); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 3d75fdc44..7bec67d2e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -16,7 +16,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testNoAction() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("4cc077eb3d343e6b7ba12bff86ebe347")); + Arrays.asList("8a105fa5eebdfffe7326bc5b3d8ffd1c")); executeTest("test no action", spec); } @@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testClusteredSnps() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -window 10 -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("ada5540bb3d9b6eb8f1337ba01e90a94")); + Arrays.asList("27b13f179bb4920615dff3a32730d845")); executeTest("test clustered SNPs", spec); } @@ -32,17 +32,17 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testMasks() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -mask foo -B:mask,VCF3 " + validationDataLocation + "vcfexample2.vcf -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("b0fcac4af3526e3b2a37602ab4c0e6ae")); + Arrays.asList("578f9e774784c25871678e6464fd212b")); executeTest("test mask all", spec1); WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -mask foo -B:mask,VCF " + validationDataLocation + "vcfMask.vcf -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("b64baabe905a5d197cc1ab594147d3d5")); + Arrays.asList("bfa86a674aefca1b13d341cb14ab3c4f")); executeTest("test mask some", spec2); WalkerTestSpec spec3 = new WalkerTestSpec( baseTestString() + " -mask foo -maskExtend 10 -B:mask,VCF " + validationDataLocation + "vcfMask.vcf -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("0eff92fe72024d535c44b98e1e9e1993")); + Arrays.asList("5939f80d14b32d88587373532d7b90e5")); executeTest("test mask extend", spec3); } @@ -50,7 +50,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilter1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("7a40795147cbfa92941489d7239aad92")); + Arrays.asList("45219dbcfb6f81bba2ea0c35f5bfd368")); executeTest("test filter #1", spec); } @@ -58,7 +58,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilter2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("e9dd4991b1e325847c77d053dfe8ee54")); + Arrays.asList("c95845e817da7352b9b72bc9794f18fb")); executeTest("test filter #2", spec); } @@ -66,7 +66,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilterWithSeparateNames() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("9ded2cce63b8d97550079047051d80a3")); + Arrays.asList("b8cdd7f44ff1a395e0a9b06a87e1e530")); executeTest("test filter with separate names #2", spec); } @@ -74,12 +74,12 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("6696e3f65a62ce912230d47cdb0c129b")); + Arrays.asList("96b61e4543a73fe725e433f007260039")); executeTest("test genotype filter #1", spec1); WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -G_filter 'AF == 0.04 && isHomVar == 1' -G_filterName foo -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("26e5b4ee954c9e0b5eb044afd4b88ee9")); + Arrays.asList("6c8112ab17ce39c8022c891ae73bf38e")); executeTest("test genotype filter #2", spec2); } @@ -87,7 +87,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testDeletions() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --filterExpression 'QUAL < 100' --filterName foo -B:variant,VCF " + validationDataLocation + "twoDeletions.vcf", 1, - Arrays.asList("e63b58be33c9126ad6cc55489aac539b")); + Arrays.asList("569546fd798afa0e65c5b61b440d07ac")); executeTest("test deletions", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 20fa7719f..1f23d262e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("258e1954e6ae55c89abc6a716e19cbe0")); + Arrays.asList("c97829259463d04b0159591bb6fb44af")); executeTest("test MultiSample Pilot1", spec); } @@ -54,12 +54,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("edeb1db288a24baff59575ceedd94243")); + Arrays.asList("2b69667f4770e8c0c894066b7f27e440")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("581990130d90071b084024f4cd7caf91")); + Arrays.asList("b77fe007c2a97fcd59dfd5eef94d8b95")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -67,7 +67,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("d120db27d694a6da32367cc4fb5770fa")); + Arrays.asList("ee8a5e63ddd470726a749e69c0c20f60")); executeTest("test SingleSample Pilot2", spec); } @@ -77,7 +77,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "75e5c430ed39f79f24e375037a388dc4"; + private final static String COMPRESSED_OUTPUT_MD5 = "ef31654a2b85b9b2d3bba4f4a75a17b6"; @Test public void testCompressedOutput() { @@ -107,7 +107,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "a29615dd37222a11b8dadd341b53e43c"; + String md5 = "46868a9c4134651c54535fb46b408aee"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -138,9 +138,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testCallingParameters() { HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "93e6269e38db9bc1732555e9969e3648" ); - e.put( "--min_mapping_quality_score 26", "64be99183c100caed4aa5f8bad64c7e9" ); - e.put( "--p_nonref_model GRID_SEARCH", "0592fe33f705ad8e2f13619fcf157805" ); + e.put( "--min_base_quality_score 26", "5043c9a101e691602eb7a3f9704bdf20" ); + e.put( "--min_mapping_quality_score 26", "71a833eb8fd93ee62ae0d5a430f27940" ); + e.put( "--p_nonref_model GRID_SEARCH", "ddf443e9dcadef367476b26b4d52c134" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -153,9 +153,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameter() { HashMap e = new HashMap(); - e.put( "-sites_only", "1483e637dc0279935a7f90d136d147bb" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "adcd91bc7dae8020df8caf1a30060e98" ); - e.put( "--output_mode EMIT_ALL_SITES", "b708acc2fa40f336bcd2d0c70091e07e" ); + e.put( "-sites_only", "eaad6ceb71ab94290650a70bea5ab951" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "05bf7db8a3d19ef4a3d14772c90b732f" ); + e.put( "--output_mode EMIT_ALL_SITES", "e4b86740468d7369f0156550855586c7" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -169,12 +169,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("64be99183c100caed4aa5f8bad64c7e9")); + Arrays.asList("71a833eb8fd93ee62ae0d5a430f27940")); executeTest("test confidence 1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("e76ca54232d02f0d92730e1affeb804e")); + Arrays.asList("79968844dc3ddecb97748c1acf2984c7")); executeTest("test confidence 2", spec2); } @@ -186,8 +186,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "18d37f7f107853b5e32c757b4e143205" ); - e.put( 1.0 / 1850, "2bcb90ce2f7542bf590f7612018fae8e" ); + e.put( 0.01, "4e878664f61d2d800146d3762303fde1" ); + e.put( 1.0 / 1850, "9204caec095ff5e63ca21a10b6fab453" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("825f05b31b5bb7e82231a15c7e4e2b0d")); + Arrays.asList("1a58ec52df545f946f80cc16c5736a91")); executeTest(String.format("test multiple technologies"), spec); } @@ -230,7 +230,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("0919ab7e513c377610e23a67d33608fa")); + Arrays.asList("62d0f6d9de344ce68ce121c13b1e78b1")); executeTest(String.format("test calling with BAQ"), spec); } @@ -244,7 +244,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq OFF", 1, - Arrays.asList("825f05b31b5bb7e82231a15c7e4e2b0d")); + Arrays.asList("1a58ec52df545f946f80cc16c5736a91")); executeTest(String.format("test calling with BAQ OFF"), spec); } @@ -263,7 +263,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("cb37348c41b8181be829912730f747e1")); + Arrays.asList("631ae1f1eb6bc4c1a4136b8495250536")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -278,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("ca5b6a5fb53ae401b146cc3044f454f2")); + Arrays.asList("fd556585c79e2b892a5976668f45aa43")); executeTest(String.format("test indel caller in SLX witn low min allele count"), spec); } @@ -291,7 +291,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("ca4343a4ab6d3cce94ce61d7d1910f81")); + Arrays.asList("9cd56feedd2787919e571383889fde70")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -301,14 +301,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("3f555b53e9dd14cf7cdf96c24e322364")); + Arrays.asList("315e1b78d7a403d7fcbcf0caa8c496b8")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("1b9764b783acf7822edc58e6822eef5b")); + Arrays.asList("cf89e0c54f14482a23c105b73a333d8a")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java index 0ed16967a..1bf3e579f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -26,7 +26,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:332341-382503", 1, - Arrays.asList("6020a68bbec97fcd87819c10cd4e2470")); + Arrays.asList("9568ba0b6624b97ac55a59bdee2d9150")); executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec); } @@ -36,7 +36,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:1232503-1332503", 1, - Arrays.asList("712c2145df4756c9a15758865d8007b5")); + Arrays.asList("ce65194c24fe83b0ec90faa6c8e6109a")); executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec); } @@ -46,7 +46,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30) + " -L chr20:332341-382503", 1, - Arrays.asList("297e0896e4761529d979f40f5ad694db")); + Arrays.asList("02d134fd544613b1e5dd7f7197fc3753")); executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec); } @@ -56,7 +56,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100) + " -L chr20:332341-382503", 1, - Arrays.asList("52a17f14692d726d3b726cf0ae7f2a09")); + Arrays.asList("2f7ec9904fc054c2ba1a7db05eb29334")); executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec); } @@ -66,7 +66,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10) + " -L chr20:332341-482503", 1, - Arrays.asList("af768f7958b8f4599c2374f1cc2fc613")); + Arrays.asList("da7a31725f229d1782dd3049848730aa")); executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec); } @@ -76,7 +76,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:652810-681757", 1, - Arrays.asList("3dd886672f59a47908b94136d0427bb0")); + Arrays.asList("e9d35cb88089fb0e8ae6678bfaeeac8c")); executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 9600046da..2fec2e70f 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -27,7 +27,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf", "d33212a84368e821cbedecd4f59756d6", // tranches "4652dca41222bebdf9d9fda343b2a835", // recal file - "5350b1a4c1250cf3b77ca45327c04711"); // cut VCF + "243a397a33a935fcaccd5deb6d16f0c0"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 600718aa0..daaab9425 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -71,24 +71,24 @@ public class CombineVariantsIntegrationTest extends WalkerTest { } - @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2117fff6e0d182cd20be508e9661829c", true); } - @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2cfaf7af3dd119df08b8a9c1f72e2f93", " -setKey foo", true); } - @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "1474ac0fde2ce42a3c24f1c97eab333e", " -setKey null", true); } - @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "7fc66df048a0ab08cf507906e1d4a308", false); } // official project VCF files in tabix format + @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "c608b9fc1e36dba6cebb4f259883f9f0", true); } + @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "20caad94411d6ab48153b214de916df8", " -setKey foo", true); } + @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "004f3065cb1bc2ce2f9afd695caf0b48", " -setKey null", true); } + @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "c9c901ff9ef2a982624b203a8086dff0", false); } // official project VCF files in tabix format - @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "ec9715f53dbf4531570557c212822f12", false); } - @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "f1072be5f5c6ee810276d9ca6537224d", false); } + @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "7593be578d4274d672fc22fced38012b", false); } + @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "1cd467863c4e948fadd970681552d57e", false); } - @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "b77a1eec725201d9d8e74ee0c45638d3", false); } // official project VCF files in tabix format - @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "802977fdfd2f4905b501bb06800f60af", false); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "a67157287dd2b24b5cdf7ebf8fcbbe9a", false); } + @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f", false); } // official project VCF files in tabix format + @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9", false); } // official project VCF files in tabix format + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "5b82f37df1f5ba40f0474d71c94142ec", false); } - @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e1f4718a179f1196538a33863da04f53", false); } + @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "c58dca482bf97069eac6d9f1a07a2cba", false); } - @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "b3783384b7c8e877b971033e90beba48", true); } + @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "89f55abea8f59e39d1effb908440548c", true); } - @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "902e541c87caa72134db6293fc46f0ad"); } - @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "f339ad4bb5863b58b9c919ce7d040bb9"); } + @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "4836086891f6cbdd40eebef3076d215a"); } + @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "6a34b5d743efda8b2f3b639f3a2f5de8"); } @Test public void threeWayWithRefs() { WalkerTestSpec spec = new WalkerTestSpec( @@ -101,7 +101,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("a07995587b855f3214fb71940bf23c0f")); + Arrays.asList("8b78339ccf7a5a5a837f79e88a3a38e5")); executeTest("threeWayWithRefs", spec); } @@ -120,7 +120,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { } // @Test public void complexTestFull() { combineComplexSites("", "64b991fd3850f83614518f7d71f0532f"); } - @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "0db9ef50fe54b60426474273d7c7fa99"); } - @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "d20acb3d53ba0a02ce92d540ebeda2a9"); } - @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "8d1b3d120515f8b56b5a0d10bc5da713"); } + @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "df96cb3beb2dbb5e02f80abec7d3571e"); } + @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "f72a178137e25dbe0b931934cdc0079d"); } + @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "f704caeaaaed6711943014b847fe381a"); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java index d32ab6282..82c894c6f 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java @@ -40,7 +40,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + b36KGReference + " -B:variant,vcf3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("37e23efd7d6471fc0f807b31ccafe0eb")); + Arrays.asList("70aeaca5b74cc7ba8e2da7b71ff0fbfd")); executeTest("test b36 to hg19", spec); } @@ -49,7 +49,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + b36KGReference + " -B:variant,vcf3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.unsortedSamples.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("b6ef4a2f026fd3843aeb9ed764a66921")); + Arrays.asList("3fd7ec2dc4064ef410786276b0dc9d08")); executeTest("test b36 to hg19, unsorted samples", spec); } @@ -58,7 +58,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + hg18Reference + " -B:variant,vcf " + validationDataLocation + "liftover_test.vcf -chain " + validationDataLocation + "hg18ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("3275373b3c44ad14a270b50664b3f8a3")); + Arrays.asList("ab2c6254225d7e2ecf52eee604d5673b")); executeTest("test hg18 to hg19, unsorted", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index e18287a21..b5f41542e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -18,7 +18,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF3 " + testfile + " -NO_HEADER"), 1, - Arrays.asList("1b9d551298dc048c7d36b60440ff4d50") + Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") ); executeTest("testComplexSelection--" + testfile, spec); @@ -31,7 +31,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -sn B -sn C -B:variant,VCF3 " + testfile + " -NO_HEADER"), 1, - Arrays.asList("5ba7536a0819421b330350a160e4261a") + Arrays.asList("b74038779fe6485dbb8734ae48178356") ); executeTest("testRepeatedLineSelection--" + testfile, spec); @@ -44,7 +44,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -disc myvar -L 20:1012700-1020000 -B:variant,VCF " + b37hapmapGenotypes + " -B:myvar,VCF " + testFile + " -o %s -NO_HEADER", 1, - Arrays.asList("97621ae8f29955eedfc4e0be3515fcb9") + Arrays.asList("78e6842325f1f1bc9ab30d5e7737ee6e") ); executeTest("testDiscordance--" + testFile, spec); @@ -57,7 +57,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -conc hapmap -L 20:1012700-1020000 -B:hapmap,VCF " + b37hapmapGenotypes + " -B:variant,VCF " + testFile + " -o %s -NO_HEADER", 1, - Arrays.asList("a0ae016fdffcbe7bfb99fd3dbc311407") + Arrays.asList("d2ba3ea30a810f6f0fbfb1b643292b6a") ); executeTest("testConcordance--" + testFile, spec); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index cf0673ee6..d7efe4212 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " --NO_HEADER" + " -o %s", 1, - Arrays.asList("debbbf3e661b6857cc8d99ff7635bb1d") + Arrays.asList("658f580f7a294fd334bd897102616fed") ); executeTest("testSimpleVCFStreaming", spec); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java index 64d0db14b..8421076c9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java @@ -20,7 +20,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testVariantsToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("bd15d98adc76b5798e3bbeff3f936feb"); + md5.add("815b82fff92aab41c209eedce2d7e7d9"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + @@ -38,7 +38,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("acd15d3f85bff5b545bc353e0e23cc6e"); + md5.add("22336ee9c12aa222ce29c3c5babca7d0"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + @@ -56,7 +56,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingHapMapInput() { List md5 = new ArrayList(); - md5.add("6f34528569f8cf5941cb365fa77288c1"); + md5.add("9bedaa7670b86a07be5191898c3727cf"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + @@ -73,7 +73,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingVCFInput() { List md5 = new ArrayList(); - md5.add("d8316fc1b9d8e954a58940354119a32e"); + md5.add("cc215edec9ca28e5c79ab1b67506f9f7"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java index 5d42f8d0c..a344817a0 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java @@ -49,7 +49,7 @@ public class VariantContextIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( cmdRoot + " -NO_HEADER -B:vcf,VCF3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.vcf -L 1:1-1000000 -o %s --outputVCF %s", 2, // just one output file - Arrays.asList("e3c35d0c4b5d4935c84a270f9df0951f", "e6673737acbb6bfabfcd92c4b2268241")); + Arrays.asList("e3c35d0c4b5d4935c84a270f9df0951f", "ff91731213fd0bbdc200ab6fd1c93e63")); executeTest("testToVCF", spec); }