From 306689421524a12d4c42bad19483554c1c07a934 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 24 Jul 2012 15:26:26 -0400 Subject: [PATCH 2/6] Bugfix for BCF2 -- Always decode genotypes block when writing out a BCF file. If the header changes (and we currently don't know this easily) then the dictionary keys used in the genotypes block may be invalid. Temporarily added a private static boolean that turns off writing of the blocks until Eric and his team rewrite the header. Signed-off-by: Mark DePristo --- .../writer/BCF2FieldWriterManager.java | 9 ++---- .../variantcontext/writer/BCF2Writer.java | 29 +++++++++++++++++-- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java index 269750c66..219daf315 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java @@ -160,7 +160,7 @@ public class BCF2FieldWriterManager { /** * Get a site writer specialized to encode values for site info field * @param field key found in the VCF header INFO records - * @return + * @return non-null writer if one can be found, or null if none exists for field */ public BCF2FieldWriter.SiteWriter getSiteFieldWriter(final String field) { return getWriter(field, siteWriters); @@ -169,17 +169,14 @@ public class BCF2FieldWriterManager { /** * Get a genotypes writer specialized to encode values for genotypes field * @param field key found in the VCF header FORMAT records - * @return + * @return non-null writer if one can be found, or null if none exists for field */ public BCF2FieldWriter.GenotypesWriter getGenotypeFieldWriter(final String field) { return getWriter(field, genotypesWriters); } @Requires({"map != null", "key != null"}) - @Ensures("result != null") public T getWriter(final String key, final Map map) { - final T writer = map.get(key); - if ( writer == null ) throw new ReviewedStingException("BUG: no writer found for " + key); - return writer; + return map.get(key); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index c77fe7172..2f7b8eabc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -83,6 +83,14 @@ import java.util.*; * @since 06/12 */ class BCF2Writer extends IndexingVariantContextWriter { + /** + * If true, we will write out the undecoded raw bytes for a genotypes block, if it + * is found in the input VC. This can be very dangerous as the genotype encoding + * depends on the exact ordering of the header. + * + * TODO -- enable when the new smart VCF header code is created by Eric Banks + */ + private final static boolean WRITE_UNDECODED_GENOTYPE_BLOCK = false; final protected static Logger logger = Logger.getLogger(BCF2Writer.class); final private static boolean ALLOW_MISSING_CONTIG_LINES = false; @@ -285,8 +293,9 @@ class BCF2Writer extends IndexingVariantContextWriter { private void buildInfo( VariantContext vc ) throws IOException { for ( Map.Entry infoFieldEntry : vc.getAttributes().entrySet() ) { - final String key = infoFieldEntry.getKey(); - final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(key); + final String field = infoFieldEntry.getKey(); + final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(field); + if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "INFO"); writer.start(encoder, vc); writer.site(encoder, vc); writer.done(encoder, vc); @@ -295,7 +304,7 @@ class BCF2Writer extends IndexingVariantContextWriter { private byte[] buildSamplesData(final VariantContext vc) throws IOException { final BCF2Codec.LazyData lazyData = getLazyData(vc); - if ( lazyData != null ) { + if ( WRITE_UNDECODED_GENOTYPE_BLOCK && lazyData != null ) { // we never decoded any data from this BCF file, so just pass it back return lazyData.bytes; } else { @@ -303,6 +312,7 @@ class BCF2Writer extends IndexingVariantContextWriter { final List genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header); for ( final String field : genotypeFields ) { final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); + if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); writer.start(encoder, vc); for ( final String name : sampleNames ) { @@ -316,6 +326,19 @@ class BCF2Writer extends IndexingVariantContextWriter { } } + /** + * Throws a meaningful error message when a field (INFO or FORMAT) is found when writing out a file + * but there's no header line for it. + * + * @param vc + * @param field + * @param fieldType + */ + private final void errorUnexpectedFieldToWrite(final VariantContext vc, final String field, final String fieldType) { + throw new UserException("Found field " + field + " in the " + fieldType + " fields of VariantContext at " + + vc.getChr() + ":" + vc.getStart() + " from " + vc.getSource() + " but this hasn't been defined in the VCFHeader"); + } + // -------------------------------------------------------------------------------- // // Low-level block encoding From 19a257a5c1c50604d044936e908452a09996208e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 24 Jul 2012 17:20:21 -0400 Subject: [PATCH 3/6] Multiple bugfixes -- VariantFiltration now properly sets passFilters in VC -- BCF2 writer now properly decodes lazy BCF genotype data that it uses. Improper use generated a horrible subtle bug but the good news is that the extra checks I put in (unnecessarily a few days ago) caught the bug! Signed-off-by: Mark DePristo --- .../walkers/filters/VariantFiltration.java | 6 ++- .../variantcontext/writer/BCF2Writer.java | 44 ++++++++++--------- .../VariantContextTestProvider.java | 2 + 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java index 4ce291ca3..1415239b4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java @@ -333,7 +333,11 @@ public class VariantFiltration extends RodWalker { filters.add(exp.name); } } - builder.filters(filters); + + if ( filters.isEmpty() ) + builder.passFilters(); + else + builder.filters(filters); writer.add(builder.make()); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 2f7b8eabc..df2008e8e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -245,9 +245,11 @@ class BCF2Writer extends IndexingVariantContextWriter { private BCF2Codec.LazyData getLazyData(final VariantContext vc) { if ( vc.getGenotypes().isLazyWithData() ) { - LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes(); - if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData ) + LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes(); + if ( WRITE_UNDECODED_GENOTYPE_BLOCK && lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData ) return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData(); + else + lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long } return null; @@ -286,6 +288,8 @@ class BCF2Writer extends IndexingVariantContextWriter { private void buildFilter( VariantContext vc ) throws IOException { if ( vc.isFiltered() ) { encodeStringsByRef(vc.getFilters()); + } else if ( vc.filtersWereApplied() ) { + encodeStringsByRef(Collections.singleton(VCFConstants.PASSES_FILTERS_v4)); } else { encoder.encodeTypedMissing(BCF2Type.INT8); } @@ -303,27 +307,27 @@ class BCF2Writer extends IndexingVariantContextWriter { } private byte[] buildSamplesData(final VariantContext vc) throws IOException { - final BCF2Codec.LazyData lazyData = getLazyData(vc); - if ( WRITE_UNDECODED_GENOTYPE_BLOCK && lazyData != null ) { + final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects + if ( lazyData != null ) { // we never decoded any data from this BCF file, so just pass it back return lazyData.bytes; - } else { - // we have to do work to convert the VC into a BCF2 byte stream - final List genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header); - for ( final String field : genotypeFields ) { - final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); - if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); - - writer.start(encoder, vc); - for ( final String name : sampleNames ) { - Genotype g = vc.getGenotype(name); - if ( g == null ) VCFWriter.missingSampleError(vc, header); - writer.addGenotype(encoder, vc, g); - } - writer.done(encoder, vc); - } - return encoder.getRecordBytes(); } + + // we have to do work to convert the VC into a BCF2 byte stream + final List genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header); + for ( final String field : genotypeFields ) { + final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); + if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); + + writer.start(encoder, vc); + for ( final String name : sampleNames ) { + Genotype g = vc.getGenotype(name); + if ( g == null ) VCFWriter.missingSampleError(vc, header); + writer.addGenotype(encoder, vc, g); + } + writer.done(encoder, vc); + } + return encoder.getRecordBytes(); } /** diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index c436a7b45..1a0e8e39d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -740,6 +740,8 @@ public class VariantContextTestProvider { Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles"); assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); + Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); BaseTest.assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); BaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); From fcefa61bcefa5cf49f3e8017849044853eac1a1e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 24 Jul 2012 17:22:40 -0400 Subject: [PATCH 4/6] Remove reference dependence in BCF2Codec -- Adding BCF2Codec to VCF.jar and associated unit tests Signed-off-by: Mark DePristo --- build.xml | 2 +- .../sting/utils/codecs/bcf2/BCF2Codec.java | 25 +++++-------------- .../VCFJarClassLoadingUnitTest.java | 4 +-- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/build.xml b/build.xml index 7389503bb..ca5d22a5a 100644 --- a/build.xml +++ b/build.xml @@ -646,7 +646,7 @@ - + diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 18b4d0b6c..0b9654610 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -26,15 +26,12 @@ package org.broadinstitute.sting.utils.codecs.bcf2; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -44,12 +41,15 @@ import java.io.ByteArrayInputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * Decode BCF2 files */ -public final class BCF2Codec implements FeatureCodec, ReferenceDependentFeatureCodec { +public final class BCF2Codec implements FeatureCodec { final protected static Logger logger = Logger.getLogger(BCF2Codec.class); private final static boolean FORBID_SYMBOLICS = false; @@ -162,7 +162,7 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD contigNames.add(contig.getID()); } } else { - logger.info("Didn't find any contig lines in BCF2 file, falling back (dangerously) to GATK reference dictionary"); + throw new UserException.MalformedBCF2("Didn't find any contig lines in BCF2 file header"); } // create the string dictionary @@ -201,19 +201,6 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD } } - // -------------------------------------------------------------------------------- - // - // Reference dependence - // - // -------------------------------------------------------------------------------- - - @Override - public void setGenomeLocParser(final GenomeLocParser genomeLocParser) { - // initialize contigNames to standard ones in reference - for ( final SAMSequenceRecord contig : genomeLocParser.getContigs().getSequences() ) - contigNames.add(contig.getSequenceName()); - } - // -------------------------------------------------------------------------------- // // implicit block diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java index fa5476150..50fbea708 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java @@ -50,12 +50,12 @@ public class VCFJarClassLoadingUnitTest { ClassLoader classLoader = new URLClassLoader(jarURLs, null); classLoader.loadClass("org.broadinstitute.sting.utils.variantcontext.VariantContext"); -// TODO -- uncomment when we include BCF2 codec -// classLoader.loadClass("org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec"); + classLoader.loadClass("org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec"); classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.VCFCodec"); classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec"); classLoader.loadClass("org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter"); classLoader.loadClass("org.broadinstitute.sting.utils.variantcontext.writer.VCFWriter"); + classLoader.loadClass("org.broadinstitute.sting.utils.variantcontext.writer.BCF2Writer"); } /** From 16947e93f24484006252988729a3f9c1656a628d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 25 Jul 2012 08:53:22 -0400 Subject: [PATCH 5/6] Integration test to ensure VariantFiltration makes . -> PASS/FAIL like VQSR Signed-off-by: Mark DePristo --- .../filters/VariantFiltrationIntegrationTest.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index ae5128c75..97b985a29 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -99,4 +99,13 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { Arrays.asList("8077eb3bab5ff98f12085eb04176fdc9")); executeTest("test deletions", spec); } + + @Test + public void testUnfilteredBecomesFilteredAndPass() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --filterExpression 'FS > 60.0' --filterName SNP_FS -V " + privateTestDir + "unfilteredForFiltering.vcf", 1, + Arrays.asList("8ed32a2272bab8043a255362335395ef")); + executeTest("testUnfilteredBecomesFilteredAndPass", spec); + } }