From 2ca5fc62a2c9aa7613a1d0fa0b612f6cb9b4fe82 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 19 Jul 2012 16:14:11 -0400 Subject: [PATCH] Support for MISSING BCF2 type -- Heng wants to use 0x0? to represent any missing type value, which in our implementation was invalid. Updated our codebase to support this construct. Heng said he'll update the BCF2 quick reference. -- Enabled integration test reading Heng's ex2.bcf file -- GATK now only warns in the case where the END info field isn't the same (or +1 due to padding) as the getEnd() function as determined by the GATK. Turns out there's a single record in the 1000G SV call set that doesn't have the right length -- VariantContextTestProvider now tests that X = Y where X -> writing -> reading -> writing -> reading = Y for a variety of variant context inputs X -- Added integration test reading 1000G SV chr1 calls (from Chris) --- .../sting/utils/codecs/bcf2/BCF2Decoder.java | 31 ++++++++-------- .../sting/utils/codecs/bcf2/BCF2Type.java | 12 +++++-- .../sting/utils/codecs/bcf2/BCF2Utils.java | 10 +++--- .../utils/variantcontext/VariantContext.java | 17 +++++++-- .../writer/BCF2FieldEncoder.java | 2 +- .../variantcontext/writer/BCF2Writer.java | 2 +- .../utils/codecs/vcf/VCFIntegrationTest.java | 15 ++++++-- .../VariantContextTestProvider.java | 36 ++++++++++++++----- 8 files changed, 87 insertions(+), 38 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index 7a6d96131..a13be21c5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -139,25 +139,26 @@ public final class BCF2Decoder { return decodeTypedValue(typeDescriptor, size); } + @Requires("size >= 0") public final Object decodeTypedValue(final byte typeDescriptor, final int size) { - final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - - assert size >= 0; - if ( size == 0 ) { + // missing value => null in java return null; - } else if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency - return decodeLiteralString(size); - } else if ( size == 1 ) { - return decodeSingleValue(type); } else { - final ArrayList ints = new ArrayList(size); - for ( int i = 0; i < size; i++ ) { - final Object val = decodeSingleValue(type); - if ( val == null ) continue; // auto-pruning. We remove trailing nulls - ints.add(val); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); + if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency + return decodeLiteralString(size); + } else if ( size == 1 ) { + return decodeSingleValue(type); + } else { + final ArrayList ints = new ArrayList(size); + for ( int i = 0; i < size; i++ ) { + final Object val = decodeSingleValue(type); + if ( val == null ) continue; // auto-pruning. We remove trailing nulls + ints.add(val); + } + return ints.isEmpty() ? null : ints; // return null when all of the values are null } - return ints.isEmpty() ? null : ints; // return null when all of the values are null } } @@ -256,7 +257,7 @@ public final class BCF2Decoder { * int elements are still forced to do a fresh allocation as well. * @return see description */ - @Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0", "type != null"}) + @Requires({"type != null", "type.isIntegerType()", "size >= 0"}) public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) { if ( size == 0 ) { return null; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java index f874b14cd..49f375b25 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java @@ -35,6 +35,7 @@ import java.util.EnumSet; * @since 05/12 */ public enum BCF2Type { + MISSING(0, 0, 0x00), INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range INT16(2, 2, 0xFFFF8000, -32767, 32767), INT32(3, 4, 0x80000000, -2147483647, 2147483647), @@ -86,7 +87,7 @@ public enum BCF2Type { * @param v * @return */ - @Requires("INTEGERS.contains(this)") + @Requires("this.isIntegerType()") public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } /** @@ -108,7 +109,14 @@ public enum BCF2Type { /** * An enum set of the types that might represent Integer values */ - public final static EnumSet INTEGERS = EnumSet.of(INT8, INT16, INT32); + private final static EnumSet INTEGERS = EnumSet.of(INT8, INT16, INT32); + + /** + * @return true if this BCF2Type corresponds to the magic "MISSING" type (0x00) + */ + public boolean isMissingType() { + return this == MISSING; + } public boolean isIntegerType() { return INTEGERS.contains(this); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index 143ab52df..43e933948 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -225,7 +225,7 @@ public final class BCF2Utils { } } - @Ensures("BCF2Type.INTEGERS.contains(result)") + @Ensures("result.isIntegerType()") public final static BCF2Type determineIntegerType(final int value) { for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) { if ( potentialType.withinRange(value) ) @@ -235,7 +235,7 @@ public final class BCF2Utils { throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value); } - @Ensures("BCF2Type.INTEGERS.contains(result)") + @Ensures("result.isIntegerType()") public final static BCF2Type determineIntegerType(final int[] values) { // literally a copy of the code below, but there's no general way to unify lists and arrays in java BCF2Type maxType = BCF2Type.INT8; @@ -260,8 +260,8 @@ public final class BCF2Utils { * @param t2 * @return */ - @Requires({"BCF2Type.INTEGERS.contains(t1)","BCF2Type.INTEGERS.contains(t2)"}) - @Ensures("BCF2Type.INTEGERS.contains(result)") + @Requires({"t1.isIntegerType()","t2.isIntegerType()"}) + @Ensures("result.isIntegerType()") public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) { switch ( t1 ) { case INT8: return t2; @@ -271,7 +271,7 @@ public final class BCF2Utils { } } - @Ensures("BCF2Type.INTEGERS.contains(result)") + @Ensures("result.isIntegerType()") public final static BCF2Type determineIntegerType(final List values) { BCF2Type maxType = BCF2Type.INT8; for ( final int value : values ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 1f0b2b054..dcdd95d00 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; @@ -176,6 +177,10 @@ import java.util.*; * @author depristo */ public class VariantContext implements Feature { // to enable tribble integration + private final static boolean WARN_ABOUT_BAD_END = true; + final protected static Logger logger = Logger.getLogger(VariantContext.class); + + private boolean fullyDecoded = false; protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; @@ -1146,10 +1151,16 @@ public class VariantContext implements Feature { // to enable tribble integratio if ( hasAttribute(VCFConstants.END_KEY) ) { final int end = getAttributeAsInt(VCFConstants.END_KEY, -1); assert end != -1; - if ( end != getEnd() ) - throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":" + if ( end != getEnd() && end != getEnd() + 1 ) { + // the end is allowed to 1 bigger because of the padding + final String message = "Badly formed variant context at location " + getChr() + ":" + getStart() + "; getEnd() was " + getEnd() - + " but this VariantContext contains an END key with value " + end); + + " but this VariantContext contains an END key with value " + end; + if ( WARN_ABOUT_BAD_END ) + logger.warn(message); + else + throw new ReviewedStingException(message); + } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java index 812e6dd07..ddeb4d284 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java @@ -47,7 +47,7 @@ import java.util.Map; */ @Invariant({ "headerLine != null", - "BCF2Type.INTEGERS.contains(dictionaryOffsetType)", + "dictionaryOffsetType.isIntegerType()", "dictionaryOffset >= 0" }) public abstract class BCF2FieldEncoder { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 45610bbf9..c77fe7172 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -338,7 +338,7 @@ class BCF2Writer extends IndexingVariantContextWriter { } @Requires("! strings.isEmpty()") - @Ensures("BCF2Type.INTEGERS.contains(result)") + @Ensures("result.isIntegerType()") private final BCF2Type encodeStringsByRef(final Collection strings) throws IOException { final List offsets = new ArrayList(strings.size()); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index b271d8c84..e9b845d59 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -39,6 +39,17 @@ public class VCFIntegrationTest extends WalkerTest { executeTest("Test reading and writing breakpoint VCF", spec1); } + @Test(enabled = true) + public void testReadingAndWriting1000GSVs() { + String testVCF = privateTestDir + "1000G_SVs.chr1.vcf"; + + String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; + + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("")); + executeTest("Test reading and writing 1000G Phase I SVs", spec1); + } + @Test public void testReadingAndWritingSamtools() { String testVCF = privateTestDir + "samtools.vcf"; @@ -59,12 +70,12 @@ public class VCFIntegrationTest extends WalkerTest { executeTest("Test writing samtools WEx BCF example", spec1); } - @Test(enabled = false) + @Test(enabled = true) public void testReadingSamtoolsWExBCFExample() { String testVCF = privateTestDir + "ex2.bcf"; String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("63a2e0484ae37b0680514f53e0bf0c94")); + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("0439e2b4ccc63bb4ba7c283cd9ab1b25")); executeTest("Test reading samtools WEx BCF example", spec1); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index ca4cdf306..c436a7b45 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -597,23 +597,41 @@ public class VariantContextTestProvider { } public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { + testReaderWriter(tester, data.header, data.vcs, data.vcs, true); + } + + public static void testReaderWriter(final VariantContextIOTest tester, + final VCFHeader header, + final List expected, + final Iterable vcs, + final boolean recurse) throws IOException { final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); tmpFile.deleteOnExit(); - // todo -- test all options - - // write + // write expected to disk final EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); final VariantContextWriter writer = tester.makeWriter(tmpFile, options); - writer.writeHeader(data.header); - final List expected = data.vcs; - for ( VariantContext vc : expected ) - writer.add(vc); - writer.close(); + writeVCsToFile(writer, header, vcs); - final Iterable actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond(); + // ensure writing of expected == actual + final Pair> p = readAllVCs(tmpFile, tester.makeCodec()); + final Iterable actual = p.getSecond(); assertEquals(actual, expected); + if ( recurse ) { + // if we are doing a recursive test, grab a fresh iterator over the written values + final Iterable read = readAllVCs(tmpFile, tester.makeCodec()).getSecond(); + testReaderWriter(tester, p.getFirst(), expected, read, false); + } + } + + private static void writeVCsToFile(final VariantContextWriter writer, final VCFHeader header, final Iterable vcs) { + // write + writer.writeHeader(header); + for ( VariantContext vc : vcs ) + if (vc != null) + writer.add(vc); + writer.close(); } /**