Support for MISSING BCF2 type

-- Heng wants to use 0x0? to represent any missing type value, which in our implementation was invalid.  Updated our codebase to support this construct.  Heng said he'll update the BCF2 quick reference.
-- Enabled integration test reading Heng's ex2.bcf file
-- GATK now only warns in the case where the END info field isn't the same (or +1 due to padding) as the getEnd() function as determined by the GATK.  Turns out there's a single record in the 1000G SV call set that doesn't have the right length
-- VariantContextTestProvider now tests that X = Y where X -> writing -> reading -> writing -> reading = Y for a variety of variant context inputs X
-- Added integration test reading 1000G SV chr1 calls (from Chris)
This commit is contained in:
Mark DePristo 2012-07-19 16:14:11 -04:00
parent 50365d01c4
commit 2ca5fc62a2
8 changed files with 87 additions and 38 deletions

View File

@ -139,25 +139,26 @@ public final class BCF2Decoder {
return decodeTypedValue(typeDescriptor, size); return decodeTypedValue(typeDescriptor, size);
} }
@Requires("size >= 0")
public final Object decodeTypedValue(final byte typeDescriptor, final int size) { public final Object decodeTypedValue(final byte typeDescriptor, final int size) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size >= 0;
if ( size == 0 ) { if ( size == 0 ) {
// missing value => null in java
return null; return null;
} else if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency
return decodeLiteralString(size);
} else if ( size == 1 ) {
return decodeSingleValue(type);
} else { } else {
final ArrayList<Object> ints = new ArrayList<Object>(size); final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
for ( int i = 0; i < size; i++ ) { if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency
final Object val = decodeSingleValue(type); return decodeLiteralString(size);
if ( val == null ) continue; // auto-pruning. We remove trailing nulls } else if ( size == 1 ) {
ints.add(val); return decodeSingleValue(type);
} else {
final ArrayList<Object> ints = new ArrayList<Object>(size);
for ( int i = 0; i < size; i++ ) {
final Object val = decodeSingleValue(type);
if ( val == null ) continue; // auto-pruning. We remove trailing nulls
ints.add(val);
}
return ints.isEmpty() ? null : ints; // return null when all of the values are null
} }
return ints.isEmpty() ? null : ints; // return null when all of the values are null
} }
} }
@ -256,7 +257,7 @@ public final class BCF2Decoder {
* int elements are still forced to do a fresh allocation as well. * int elements are still forced to do a fresh allocation as well.
* @return see description * @return see description
*/ */
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0", "type != null"}) @Requires({"type != null", "type.isIntegerType()", "size >= 0"})
public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) { public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) {
if ( size == 0 ) { if ( size == 0 ) {
return null; return null;

View File

@ -35,6 +35,7 @@ import java.util.EnumSet;
* @since 05/12 * @since 05/12
*/ */
public enum BCF2Type { public enum BCF2Type {
MISSING(0, 0, 0x00),
INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range
INT16(2, 2, 0xFFFF8000, -32767, 32767), INT16(2, 2, 0xFFFF8000, -32767, 32767),
INT32(3, 4, 0x80000000, -2147483647, 2147483647), INT32(3, 4, 0x80000000, -2147483647, 2147483647),
@ -86,7 +87,7 @@ public enum BCF2Type {
* @param v * @param v
* @return * @return
*/ */
@Requires("INTEGERS.contains(this)") @Requires("this.isIntegerType()")
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
/** /**
@ -108,7 +109,14 @@ public enum BCF2Type {
/** /**
* An enum set of the types that might represent Integer values * An enum set of the types that might represent Integer values
*/ */
public final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32); private final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);
/**
* @return true if this BCF2Type corresponds to the magic "MISSING" type (0x00)
*/
public boolean isMissingType() {
return this == MISSING;
}
public boolean isIntegerType() { public boolean isIntegerType() {
return INTEGERS.contains(this); return INTEGERS.contains(this);

View File

@ -225,7 +225,7 @@ public final class BCF2Utils {
} }
} }
@Ensures("BCF2Type.INTEGERS.contains(result)") @Ensures("result.isIntegerType()")
public final static BCF2Type determineIntegerType(final int value) { public final static BCF2Type determineIntegerType(final int value) {
for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) { for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) {
if ( potentialType.withinRange(value) ) if ( potentialType.withinRange(value) )
@ -235,7 +235,7 @@ public final class BCF2Utils {
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value); throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
} }
@Ensures("BCF2Type.INTEGERS.contains(result)") @Ensures("result.isIntegerType()")
public final static BCF2Type determineIntegerType(final int[] values) { public final static BCF2Type determineIntegerType(final int[] values) {
// literally a copy of the code below, but there's no general way to unify lists and arrays in java // literally a copy of the code below, but there's no general way to unify lists and arrays in java
BCF2Type maxType = BCF2Type.INT8; BCF2Type maxType = BCF2Type.INT8;
@ -260,8 +260,8 @@ public final class BCF2Utils {
* @param t2 * @param t2
* @return * @return
*/ */
@Requires({"BCF2Type.INTEGERS.contains(t1)","BCF2Type.INTEGERS.contains(t2)"}) @Requires({"t1.isIntegerType()","t2.isIntegerType()"})
@Ensures("BCF2Type.INTEGERS.contains(result)") @Ensures("result.isIntegerType()")
public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) { public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
switch ( t1 ) { switch ( t1 ) {
case INT8: return t2; case INT8: return t2;
@ -271,7 +271,7 @@ public final class BCF2Utils {
} }
} }
@Ensures("BCF2Type.INTEGERS.contains(result)") @Ensures("result.isIntegerType()")
public final static BCF2Type determineIntegerType(final List<Integer> values) { public final static BCF2Type determineIntegerType(final List<Integer> values) {
BCF2Type maxType = BCF2Type.INT8; BCF2Type maxType = BCF2Type.INT8;
for ( final int value : values ) { for ( final int value : values ) {

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.utils.variantcontext; package org.broadinstitute.sting.utils.variantcontext;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature; import org.broad.tribble.Feature;
import org.broad.tribble.TribbleException; import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils; import org.broad.tribble.util.ParsingUtils;
@ -176,6 +177,10 @@ import java.util.*;
* @author depristo * @author depristo
*/ */
public class VariantContext implements Feature { // to enable tribble integration public class VariantContext implements Feature { // to enable tribble integration
private final static boolean WARN_ABOUT_BAD_END = true;
final protected static Logger logger = Logger.getLogger(VariantContext.class);
private boolean fullyDecoded = false; private boolean fullyDecoded = false;
protected CommonInfo commonInfo = null; protected CommonInfo commonInfo = null;
public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR;
@ -1146,10 +1151,16 @@ public class VariantContext implements Feature { // to enable tribble integratio
if ( hasAttribute(VCFConstants.END_KEY) ) { if ( hasAttribute(VCFConstants.END_KEY) ) {
final int end = getAttributeAsInt(VCFConstants.END_KEY, -1); final int end = getAttributeAsInt(VCFConstants.END_KEY, -1);
assert end != -1; assert end != -1;
if ( end != getEnd() ) if ( end != getEnd() && end != getEnd() + 1 ) {
throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":" // the end is allowed to 1 bigger because of the padding
final String message = "Badly formed variant context at location " + getChr() + ":"
+ getStart() + "; getEnd() was " + getEnd() + getStart() + "; getEnd() was " + getEnd()
+ " but this VariantContext contains an END key with value " + end); + " but this VariantContext contains an END key with value " + end;
if ( WARN_ABOUT_BAD_END )
logger.warn(message);
else
throw new ReviewedStingException(message);
}
} }
} }

View File

@ -47,7 +47,7 @@ import java.util.Map;
*/ */
@Invariant({ @Invariant({
"headerLine != null", "headerLine != null",
"BCF2Type.INTEGERS.contains(dictionaryOffsetType)", "dictionaryOffsetType.isIntegerType()",
"dictionaryOffset >= 0" "dictionaryOffset >= 0"
}) })
public abstract class BCF2FieldEncoder { public abstract class BCF2FieldEncoder {

View File

@ -338,7 +338,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
} }
@Requires("! strings.isEmpty()") @Requires("! strings.isEmpty()")
@Ensures("BCF2Type.INTEGERS.contains(result)") @Ensures("result.isIntegerType()")
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException { private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
final List<Integer> offsets = new ArrayList<Integer>(strings.size()); final List<Integer> offsets = new ArrayList<Integer>(strings.size());

View File

@ -39,6 +39,17 @@ public class VCFIntegrationTest extends WalkerTest {
executeTest("Test reading and writing breakpoint VCF", spec1); executeTest("Test reading and writing breakpoint VCF", spec1);
} }
@Test(enabled = true)
public void testReadingAndWriting1000GSVs() {
String testVCF = privateTestDir + "1000G_SVs.chr1.vcf";
String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s ";
String test1 = baseCommand + "-T SelectVariants -V " + testVCF;
WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList(""));
executeTest("Test reading and writing 1000G Phase I SVs", spec1);
}
@Test @Test
public void testReadingAndWritingSamtools() { public void testReadingAndWritingSamtools() {
String testVCF = privateTestDir + "samtools.vcf"; String testVCF = privateTestDir + "samtools.vcf";
@ -59,12 +70,12 @@ public class VCFIntegrationTest extends WalkerTest {
executeTest("Test writing samtools WEx BCF example", spec1); executeTest("Test writing samtools WEx BCF example", spec1);
} }
@Test(enabled = false) @Test(enabled = true)
public void testReadingSamtoolsWExBCFExample() { public void testReadingSamtoolsWExBCFExample() {
String testVCF = privateTestDir + "ex2.bcf"; String testVCF = privateTestDir + "ex2.bcf";
String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s ";
String test1 = baseCommand + "-T SelectVariants -V " + testVCF; String test1 = baseCommand + "-T SelectVariants -V " + testVCF;
WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("63a2e0484ae37b0680514f53e0bf0c94")); WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("0439e2b4ccc63bb4ba7c283cd9ab1b25"));
executeTest("Test reading samtools WEx BCF example", spec1); executeTest("Test reading samtools WEx BCF example", spec1);
} }

View File

@ -597,23 +597,41 @@ public class VariantContextTestProvider {
} }
public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
testReaderWriter(tester, data.header, data.vcs, data.vcs, true);
}
public static void testReaderWriter(final VariantContextIOTest tester,
final VCFHeader header,
final List<VariantContext> expected,
final Iterable<VariantContext> vcs,
final boolean recurse) throws IOException {
final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension());
tmpFile.deleteOnExit(); tmpFile.deleteOnExit();
// todo -- test all options // write expected to disk
// write
final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY); final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
final VariantContextWriter writer = tester.makeWriter(tmpFile, options); final VariantContextWriter writer = tester.makeWriter(tmpFile, options);
writer.writeHeader(data.header); writeVCsToFile(writer, header, vcs);
final List<VariantContext> expected = data.vcs;
for ( VariantContext vc : expected )
writer.add(vc);
writer.close();
final Iterable<VariantContext> actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond(); // ensure writing of expected == actual
final Pair<VCFHeader, Iterable<VariantContext>> p = readAllVCs(tmpFile, tester.makeCodec());
final Iterable<VariantContext> actual = p.getSecond();
assertEquals(actual, expected); assertEquals(actual, expected);
if ( recurse ) {
// if we are doing a recursive test, grab a fresh iterator over the written values
final Iterable<VariantContext> read = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
testReaderWriter(tester, p.getFirst(), expected, read, false);
}
}
private static void writeVCsToFile(final VariantContextWriter writer, final VCFHeader header, final Iterable<VariantContext> vcs) {
// write
writer.writeHeader(header);
for ( VariantContext vc : vcs )
if (vc != null)
writer.add(vc);
writer.close();
} }
/** /**