Support for MISSING BCF2 type

-- Heng wants to use 0x0? to represent any missing type value, which in our implementation was invalid.  Updated our codebase to support this construct.  Heng said he'll update the BCF2 quick reference.
-- Enabled integration test reading Heng's ex2.bcf file
-- GATK now only warns in the case where the END info field isn't the same (or +1 due to padding) as the getEnd() function as determined by the GATK.  Turns out there's a single record in the 1000G SV call set that doesn't have the right length
-- VariantContextTestProvider now tests that X = Y where X -> writing -> reading -> writing -> reading = Y for a variety of variant context inputs X
-- Added integration test reading 1000G SV chr1 calls (from Chris)
This commit is contained in:
Mark DePristo 2012-07-19 16:14:11 -04:00
parent 50365d01c4
commit 2ca5fc62a2
8 changed files with 87 additions and 38 deletions

View File

@ -139,25 +139,26 @@ public final class BCF2Decoder {
return decodeTypedValue(typeDescriptor, size);
}
@Requires("size >= 0")
public final Object decodeTypedValue(final byte typeDescriptor, final int size) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size >= 0;
if ( size == 0 ) {
// missing value => null in java
return null;
} else if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency
return decodeLiteralString(size);
} else if ( size == 1 ) {
return decodeSingleValue(type);
} else {
final ArrayList<Object> ints = new ArrayList<Object>(size);
for ( int i = 0; i < size; i++ ) {
final Object val = decodeSingleValue(type);
if ( val == null ) continue; // auto-pruning. We remove trailing nulls
ints.add(val);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency
return decodeLiteralString(size);
} else if ( size == 1 ) {
return decodeSingleValue(type);
} else {
final ArrayList<Object> ints = new ArrayList<Object>(size);
for ( int i = 0; i < size; i++ ) {
final Object val = decodeSingleValue(type);
if ( val == null ) continue; // auto-pruning. We remove trailing nulls
ints.add(val);
}
return ints.isEmpty() ? null : ints; // return null when all of the values are null
}
return ints.isEmpty() ? null : ints; // return null when all of the values are null
}
}
@ -256,7 +257,7 @@ public final class BCF2Decoder {
* int elements are still forced to do a fresh allocation as well.
* @return see description
*/
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0", "type != null"})
@Requires({"type != null", "type.isIntegerType()", "size >= 0"})
public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) {
if ( size == 0 ) {
return null;

View File

@ -35,6 +35,7 @@ import java.util.EnumSet;
* @since 05/12
*/
public enum BCF2Type {
MISSING(0, 0, 0x00),
INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range
INT16(2, 2, 0xFFFF8000, -32767, 32767),
INT32(3, 4, 0x80000000, -2147483647, 2147483647),
@ -86,7 +87,7 @@ public enum BCF2Type {
* @param v
* @return
*/
@Requires("INTEGERS.contains(this)")
@Requires("this.isIntegerType()")
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
/**
@ -108,7 +109,14 @@ public enum BCF2Type {
/**
* An enum set of the types that might represent Integer values
*/
public final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);
private final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);
/**
* @return true if this BCF2Type corresponds to the magic "MISSING" type (0x00)
*/
public boolean isMissingType() {
return this == MISSING;
}
public boolean isIntegerType() {
return INTEGERS.contains(this);

View File

@ -225,7 +225,7 @@ public final class BCF2Utils {
}
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
@Ensures("result.isIntegerType()")
public final static BCF2Type determineIntegerType(final int value) {
for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) {
if ( potentialType.withinRange(value) )
@ -235,7 +235,7 @@ public final class BCF2Utils {
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
@Ensures("result.isIntegerType()")
public final static BCF2Type determineIntegerType(final int[] values) {
// literally a copy of the code below, but there's no general way to unify lists and arrays in java
BCF2Type maxType = BCF2Type.INT8;
@ -260,8 +260,8 @@ public final class BCF2Utils {
* @param t2
* @return
*/
@Requires({"BCF2Type.INTEGERS.contains(t1)","BCF2Type.INTEGERS.contains(t2)"})
@Ensures("BCF2Type.INTEGERS.contains(result)")
@Requires({"t1.isIntegerType()","t2.isIntegerType()"})
@Ensures("result.isIntegerType()")
public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
switch ( t1 ) {
case INT8: return t2;
@ -271,7 +271,7 @@ public final class BCF2Utils {
}
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
@Ensures("result.isIntegerType()")
public final static BCF2Type determineIntegerType(final List<Integer> values) {
BCF2Type maxType = BCF2Type.INT8;
for ( final int value : values ) {

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
@ -176,6 +177,10 @@ import java.util.*;
* @author depristo
*/
public class VariantContext implements Feature { // to enable tribble integration
private final static boolean WARN_ABOUT_BAD_END = true;
final protected static Logger logger = Logger.getLogger(VariantContext.class);
private boolean fullyDecoded = false;
protected CommonInfo commonInfo = null;
public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR;
@ -1146,10 +1151,16 @@ public class VariantContext implements Feature { // to enable tribble integratio
if ( hasAttribute(VCFConstants.END_KEY) ) {
final int end = getAttributeAsInt(VCFConstants.END_KEY, -1);
assert end != -1;
if ( end != getEnd() )
throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":"
if ( end != getEnd() && end != getEnd() + 1 ) {
// the end is allowed to 1 bigger because of the padding
final String message = "Badly formed variant context at location " + getChr() + ":"
+ getStart() + "; getEnd() was " + getEnd()
+ " but this VariantContext contains an END key with value " + end);
+ " but this VariantContext contains an END key with value " + end;
if ( WARN_ABOUT_BAD_END )
logger.warn(message);
else
throw new ReviewedStingException(message);
}
}
}

View File

@ -47,7 +47,7 @@ import java.util.Map;
*/
@Invariant({
"headerLine != null",
"BCF2Type.INTEGERS.contains(dictionaryOffsetType)",
"dictionaryOffsetType.isIntegerType()",
"dictionaryOffset >= 0"
})
public abstract class BCF2FieldEncoder {

View File

@ -338,7 +338,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
@Requires("! strings.isEmpty()")
@Ensures("BCF2Type.INTEGERS.contains(result)")
@Ensures("result.isIntegerType()")
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
final List<Integer> offsets = new ArrayList<Integer>(strings.size());

View File

@ -39,6 +39,17 @@ public class VCFIntegrationTest extends WalkerTest {
executeTest("Test reading and writing breakpoint VCF", spec1);
}
@Test(enabled = true)
public void testReadingAndWriting1000GSVs() {
String testVCF = privateTestDir + "1000G_SVs.chr1.vcf";
String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s ";
String test1 = baseCommand + "-T SelectVariants -V " + testVCF;
WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList(""));
executeTest("Test reading and writing 1000G Phase I SVs", spec1);
}
@Test
public void testReadingAndWritingSamtools() {
String testVCF = privateTestDir + "samtools.vcf";
@ -59,12 +70,12 @@ public class VCFIntegrationTest extends WalkerTest {
executeTest("Test writing samtools WEx BCF example", spec1);
}
@Test(enabled = false)
@Test(enabled = true)
public void testReadingSamtoolsWExBCFExample() {
String testVCF = privateTestDir + "ex2.bcf";
String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s ";
String test1 = baseCommand + "-T SelectVariants -V " + testVCF;
WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("63a2e0484ae37b0680514f53e0bf0c94"));
WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("0439e2b4ccc63bb4ba7c283cd9ab1b25"));
executeTest("Test reading samtools WEx BCF example", spec1);
}

View File

@ -597,23 +597,41 @@ public class VariantContextTestProvider {
}
public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
testReaderWriter(tester, data.header, data.vcs, data.vcs, true);
}
public static void testReaderWriter(final VariantContextIOTest tester,
final VCFHeader header,
final List<VariantContext> expected,
final Iterable<VariantContext> vcs,
final boolean recurse) throws IOException {
final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension());
tmpFile.deleteOnExit();
// todo -- test all options
// write
// write expected to disk
final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
final VariantContextWriter writer = tester.makeWriter(tmpFile, options);
writer.writeHeader(data.header);
final List<VariantContext> expected = data.vcs;
for ( VariantContext vc : expected )
writer.add(vc);
writer.close();
writeVCsToFile(writer, header, vcs);
final Iterable<VariantContext> actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
// ensure writing of expected == actual
final Pair<VCFHeader, Iterable<VariantContext>> p = readAllVCs(tmpFile, tester.makeCodec());
final Iterable<VariantContext> actual = p.getSecond();
assertEquals(actual, expected);
if ( recurse ) {
// if we are doing a recursive test, grab a fresh iterator over the written values
final Iterable<VariantContext> read = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
testReaderWriter(tester, p.getFirst(), expected, read, false);
}
}
private static void writeVCsToFile(final VariantContextWriter writer, final VCFHeader header, final Iterable<VariantContext> vcs) {
// write
writer.writeHeader(header);
for ( VariantContext vc : vcs )
if (vc != null)
writer.add(vc);
writer.close();
}
/**