Generalize / unify code for handling strings

-- List<String> is converted inside of the codec to a collapsed string, and exploded in the decoder.
-- Unified the type conversion code in BCF2Writer to simplify the mapping from VCF type => BCF type and special value recoding
-- Code cleanup and renaming
This commit is contained in:
Mark DePristo 2012-05-16 08:59:26 -04:00
parent b4a5acd6f4
commit dfee17a672
7 changed files with 167 additions and 97 deletions

View File

@ -173,12 +173,14 @@ public class BCF2Decoder {
//
// ----------------------------------------------------------------------
private final String decodeLiteralString(final int size) {
private final Object decodeLiteralString(final int size) {
assert size > 0;
// TODO -- assumes size > 0
final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array
try {
recordStream.read(bytes);
return new String(bytes);
final String s = new String(bytes);
return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s;
} catch ( IOException e ) {
throw new ReviewedStingException("readByte failure", e);
}

View File

@ -72,15 +72,13 @@ public class BCF2Encoder {
public final BCF2Type encode(final Object o) throws IOException {
if ( o == null ) throw new ReviewedStingException("Generic encode cannot deal with null values");
if ( o instanceof String ) {
return encodeString((String)o);
} else if ( o instanceof List ) {
final BCF2Type type = determinePrimitiveType(((List) o).get(0));
encodeTypedVector((List) o, type);
if ( o instanceof List ) {
final BCF2Type type = determineBCFType(((List) o).get(0));
encodeTyped((List) o, type);
return type;
} else {
final BCF2Type type = determinePrimitiveType(o);
encodeTypedSingleton(o, type);
final BCF2Type type = determineBCFType(o);
encodeTyped(o, type);
return type;
}
}
@ -92,31 +90,27 @@ public class BCF2Encoder {
// --------------------------------------------------------------------------------
public final void encodeTypedMissing(final BCF2Type type) throws IOException {
encodeTypedVector(Collections.emptyList(), type);
encodeTyped(Collections.emptyList(), type);
}
// todo -- should be specialized for each object type for efficiency
public final void encodeTypedSingleton(final Object v, final BCF2Type type) throws IOException {
encodeTypedVector(Collections.singleton(v), type);
public final void encodeTyped(final Object v, final BCF2Type type) throws IOException {
encodeTyped(Collections.singletonList(v), type);
}
public final BCF2Type encodeString(final String v) throws IOException {
// TODO -- this needs to be optimized
final byte[] bytes = v.getBytes();
final List<Byte> l = new ArrayList<Byte>(bytes.length);
for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]);
encodeTypedVector(l, BCF2Type.CHAR);
return BCF2Type.CHAR;
}
public final void encodeTyped(List<? extends Object> v, final BCF2Type type) throws IOException {
if ( type == BCF2Type.CHAR && v.size() != 0 ) {
final String s = v.size() > 1 ? BCF2Utils.collapseStringList((List<String>)v) : (String)v.get(0);
v = stringToBytes(s);
}
public final <T extends Object> void encodeTypedVector(final Collection<T> v, final BCF2Type type) throws IOException {
encodeType(v.size(), type);
encodeRawValues(v, type);
}
public final BCF2Type encodeTypedIntOfBestSize(final int value) throws IOException {
final BCF2Type type = determineIntegerType(value);
encodeTypedSingleton(value, type);
encodeTyped(value, type);
return type;
}
@ -214,13 +208,17 @@ public class BCF2Encoder {
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
private final BCF2Type determinePrimitiveType(final Object v) {
if ( v instanceof Integer )
return determineIntegerType((Integer)v);
else if ( v instanceof Double )
private final BCF2Type determineBCFType(final Object arg) {
final Object toType = arg instanceof List ? ((List)arg).get(0) : arg;
if ( toType instanceof Integer )
return determineIntegerType((Integer)toType);
else if ( toType instanceof String )
return BCF2Type.CHAR;
else if ( toType instanceof Double )
return BCF2Type.FLOAT;
else
throw new ReviewedStingException("No native encoding for Object of type " + v.getClass().getSimpleName());
throw new ReviewedStingException("No native encoding for Object of type " + arg.getClass().getSimpleName());
}
public final static void encodePrimitive(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException {
@ -231,4 +229,12 @@ public class BCF2Encoder {
encodeStream.write(byteValue);
}
}
/**
 * Boxes the bytes of a string into a list, one Byte per encoded byte.
 *
 * The bytes come from String.getBytes() with no charset argument, i.e. the
 * platform default charset.
 *
 * @param v the string to convert
 * @return a new list holding the bytes of v in order
 */
private final List<Byte> stringToBytes(final String v) throws IOException {
    // TODO -- this needs to be optimized away for efficiency
    final byte[] raw = v.getBytes();
    final List<Byte> boxed = new ArrayList<Byte>(raw.length);
    for ( final byte b : raw ) {
        boxed.add(b);
    }
    return boxed;
}
}

View File

@ -31,29 +31,28 @@ package org.broadinstitute.sting.utils.codecs.bcf2;
* @since 05/12
*/
public enum BCF2Type {
RESERVED_0,
INT8(1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
INT16(2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767),
INT32(4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647),
RESERVED_4,
FLOAT(4, BCF2Utils.FLOAT_MISSING_VALUE),
RESERVED_6,
CHAR;
INT8(1, 1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
INT16(2, 2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767),
INT32(3, 4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647),
FLOAT(5, 4, BCF2Utils.FLOAT_MISSING_VALUE),
CHAR(7);
private final int id;
private final Object missingJavaValue;
private final int missingBytes;
private final int sizeInBytes;
private final long minValue, maxValue;
BCF2Type() {
this(-1, 0, 0, 0);
BCF2Type(final int id) {
this(id, -1, 0, 0, 0);
}
BCF2Type(final int sizeInBytes, final int missingBytes) {
this(sizeInBytes, missingBytes, 0, 0);
BCF2Type(final int id, final int sizeInBytes, final int missingBytes) {
this(id, sizeInBytes, missingBytes, 0, 0);
}
BCF2Type(final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) {
BCF2Type(final int id, final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) {
this.id = id;
this.sizeInBytes = sizeInBytes;
this.missingJavaValue = null;
this.missingBytes = missingBytes;
@ -64,7 +63,7 @@ public enum BCF2Type {
public int getSizeInBytes() {
return sizeInBytes;
}
public int getID() { return ordinal(); }
public int getID() { return id; }
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
public Object getMissingJavaValue() { return missingJavaValue; }
public int getMissingBytes() { return missingBytes; }

View File

@ -34,6 +34,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Common utilities for working with BCF2 files
@ -56,6 +57,14 @@ public class BCF2Utils {
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
/**
 * Lookup table from a BCF2 type ID (as returned by BCF2Type.getID()) to its enum constant.
 * The table has one slot per ID from 0 through the largest ID in use; slots whose ID
 * belongs to no BCF2Type constant are left null.
 */
public final static BCF2Type[] ID_TO_ENUM;
static {
    final BCF2Type[] allTypes = BCF2Type.values();

    // first pass: find the largest ID so the table can be sized
    int largestID = -1;
    for ( final BCF2Type type : allTypes ) {
        if ( type.getID() > largestID )
            largestID = type.getID();
    }

    // second pass: place each constant at the index given by its ID
    ID_TO_ENUM = new BCF2Type[largestID + 1];
    for ( final BCF2Type type : allTypes ) {
        ID_TO_ENUM[type.getID()] = type;
    }
}
private BCF2Utils() {}
@ -98,7 +107,7 @@ public class BCF2Utils {
}
public final static BCF2Type decodeType(final byte typeDescriptor) {
return BCF2Type.values()[decodeTypeID(typeDescriptor)];
return ID_TO_ENUM[decodeTypeID(typeDescriptor)];
}
public final static boolean sizeIsOverflow(final byte typeDescriptor) {
@ -140,4 +149,42 @@ public class BCF2Utils {
} default: throw new ReviewedStingException("Unexpected size during decoding");
}
}
/**
 * Joins several strings into one, writing a comma in front of every element
 * (including the first).
 *
 * ["s1", "s2", "s3"] => ",s1,s2,s3"
 *
 * @param strings list of more than one string; no element may itself contain a comma
 *                (both conditions are checked only when assertions are enabled)
 * @return the collapsed, comma-prefixed string
 */
public static final String collapseStringList(final List<String> strings) {
    assert strings.size() > 1;

    final StringBuilder joined = new StringBuilder();
    for ( final String element : strings ) {
        // a comma inside an element would be indistinguishable from a separator
        assert ! element.contains(",");
        joined.append(',');
        joined.append(element);
    }
    return joined.toString();
}
/**
 * Inverse operation of collapseStringList: splits a comma-prefixed string back
 * into its elements.
 *
 * ",s1,s2,s3" => ["s1", "s2", "s3"]
 *
 * Empty elements are preserved, including trailing ones (",s1," => ["s1", ""]),
 * so that any list accepted by collapseStringList round-trips unchanged.
 *
 * @param collapsed a string beginning with a comma, as produced by collapseStringList
 *                  (checked only when assertions are enabled)
 * @return a fixed-size list of the individual strings, in their original order
 */
public static final List<String> exploreStringList(final String collapsed) {
    assert isCollapsedString(collapsed);
    // a negative limit keeps trailing empty strings, which split(",") alone would silently drop
    final String[] exploded = collapsed.substring(1).split(",", -1);
    return Arrays.asList(exploded);
}
/**
 * Tests whether a string is in the collapsed-list form, i.e. starts with a comma.
 *
 * @param s the string to test; may be empty
 * @return true if s begins with ','; false otherwise, including for the empty string
 */
public static final boolean isCollapsedString(final String s) {
    // guard the charAt: an empty string has no first character and is not a collapsed list
    return ! s.isEmpty() && s.charAt(0) == ',';
}
}

View File

@ -173,13 +173,13 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
private void buildID( VariantContext vc ) throws IOException {
encoder.encodeString(vc.getID());
encoder.encodeTyped(vc.getID(), BCF2Type.CHAR);
}
private void buildAlleles( VariantContext vc ) throws IOException {
for ( final Allele allele : vc.getAlleles() ) {
final String s = vc.getAlleleWithRefPadding(allele);
encoder.encodeString(s);
encoder.encodeTyped(s, BCF2Type.CHAR);
}
}
@ -194,19 +194,10 @@ class BCF2Writer extends IndexingVariantContextWriter {
private void buildInfo( VariantContext vc ) throws IOException {
for ( Map.Entry<String, Object> infoFieldEntry : vc.getAttributes().entrySet() ) {
final String key = infoFieldEntry.getKey();
Object value = infoFieldEntry.getValue();
final VCFToBCFType typeEquiv = getBCF2TypeFromHeader(key, value);
// handle the special FLAG case -- super annoying
if ( typeEquiv.vcfType == VCFHeaderLineType.Flag ) value = 1;
final VCFToBCFEncoding encoding = prepFieldValueForEncoding(key, infoFieldEntry.getValue());
encodeStringByRef(key);
if ( value instanceof List ) // NOTE: ONLY WORKS WITH LISTS
encoder.encodeTypedVector((List) value, typeEquiv.BCF2Type);
else if ( value instanceof String )
encoder.encodeString((String)value);
else
encoder.encodeTypedSingleton(value, typeEquiv.BCF2Type);
encoder.encodeTyped(encoding.valuesToEncode, encoding.BCF2Type);
}
}
@ -265,51 +256,67 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final void addGenericGenotypeField(final VariantContext vc, final String field) throws IOException {
final int numInFormatField = getNGenotypeFieldValues(field, vc);
final VCFToBCFType type = getBCF2TypeFromHeader(field, null);
final VCFToBCFEncoding encoding = prepFieldValueForEncoding(field, null);
startGenotypeField(field, numInFormatField, type.BCF2Type);
startGenotypeField(field, numInFormatField, encoding.BCF2Type);
for ( final Genotype g : vc.getGenotypes() ) {
if ( ! g.hasAttribute(field) ) {
encoder.encodeRawMissingValues(numInFormatField, type.BCF2Type);
encoder.encodeRawMissingValues(numInFormatField, encoding.BCF2Type);
} else {
final Object val = g.getAttribute(field);
final Collection<Object> vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val;
encoder.encodeRawValues(vals, type.BCF2Type);
encoder.encodeRawValues(vals, encoding.BCF2Type);
}
}
}
private final class VCFToBCFType {
private final class VCFToBCFEncoding {
VCFHeaderLineType vcfType;
BCF2Type BCF2Type;
List<Object> valuesToEncode;
private VCFToBCFType(final VCFHeaderLineType vcfType, final BCF2Type BCF2Type) {
private VCFToBCFEncoding(final VCFHeaderLineType vcfType, final BCF2Type BCF2Type, final List<? extends Object> valuesToEncode) {
this.vcfType = vcfType;
this.BCF2Type = BCF2Type;
this.valuesToEncode = (List<Object>)valuesToEncode;
}
}
// TODO -- we really need explicit converters as first class objects
private final VCFToBCFType getBCF2TypeFromHeader(final String field, final Object maybeIntValue) {
// TODO -- need to generalize so we can enable vectors of compressed genotype ints
// TODO -- need to generalize so we can enable vectors of compressed genotype ints
// TODO -- no sense in allocating these over and over
private final VCFToBCFEncoding prepFieldValueForEncoding(final String field, final Object value) {
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field);
final boolean isList = value instanceof List;
final Object toType = isList ? ((List)value).get(0) : value;
// TODO -- no sense in allocating these over and over
switch ( metaData.getType() ) {
case Character: return new VCFToBCFType(metaData.getType(), BCF2Type.CHAR);
case Flag: return new VCFToBCFType(metaData.getType(), BCF2Type.INT8);
case String: return new VCFToBCFType(metaData.getType(), BCF2Type.CHAR);
case Character:
assert toType instanceof String;
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(value));
case Flag:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1));
case String:
final List<String> s = isList ? (List<String>)value : Collections.singletonList((String)value);
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s);
case Integer: // note integer calculation is a bit complex because of the need to determine sizes
BCF2Type type;
if ( maybeIntValue == null )
type = BCF2Type.INT8;
else if ( maybeIntValue instanceof List )
type = encoder.determineIntegerType(((List<Integer>)maybeIntValue));
else
type = encoder.determineIntegerType((Integer)maybeIntValue);
return new VCFToBCFType(metaData.getType(), type);
case Float: return new VCFToBCFType(metaData.getType(), BCF2Type.FLOAT);
default: throw new ReviewedStingException("Unexpected type for field " + field);
List<Integer> l;
BCF2Type intType;
if ( isList ) {
l = (List<Integer>)value;
intType = encoder.determineIntegerType(l);
} else if ( value != null ) {
intType = encoder.determineIntegerType((Integer)value);
l = Collections.singletonList((Integer)value);
} else {
intType = BCF2Type.INT8;
l = Collections.singletonList((Integer) null);
}
return new VCFToBCFEncoding(metaData.getType(), intType, l);
case Float:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.FLOAT, isList ? (List<Double>)value : Collections.singletonList(value));
default:
throw new ReviewedStingException("Unexpected type for field " + field);
}
}
@ -395,7 +402,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
// we've checked the types for all strings, so write them out
encoder.encodeTypedVector(offsets, maxType);
encoder.encodeTyped(offsets, maxType);
return maxType;
}

View File

@ -30,6 +30,7 @@ package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
@ -42,7 +43,7 @@ import java.io.InputStream;
import java.util.*;
public class EncoderDecoderUnitTest extends BaseTest {
public class BCF2EncoderDecoderUnitTest extends BaseTest {
private final double FLOAT_TOLERANCE = 1e-6;
final List<BCF2TypedValue> primitives = new ArrayList<BCF2TypedValue>();
final List<BCF2TypedValue> basicTypes = new ArrayList<BCF2TypedValue>();
@ -54,7 +55,7 @@ public class EncoderDecoderUnitTest extends BaseTest {
basicTypes.add(new BCF2TypedValue(1000, BCF2Type.INT16));
basicTypes.add(new BCF2TypedValue(1000000, BCF2Type.INT32));
basicTypes.add(new BCF2TypedValue(1.2345e6, BCF2Type.FLOAT));
basicTypes.add(new BCF2TypedValue(new Byte((byte)'A'), BCF2Type.CHAR));
basicTypes.add(new BCF2TypedValue("A", BCF2Type.CHAR));
// small ints
primitives.add(new BCF2TypedValue(0, BCF2Type.INT8));
@ -137,7 +138,6 @@ public class EncoderDecoderUnitTest extends BaseTest {
for ( BCF2Type type : BCF2Type.values() ) {
forCombinations.add(new BCF2TypedValue(null, type));
}
}
// --------------------------------------------------------------------------------
@ -203,28 +203,38 @@ public class EncoderDecoderUnitTest extends BaseTest {
decodeRecord(toEncode, record);
}
/**
 * Supplies (list of strings, expected collapsed form) pairs for the
 * string-list collapse / explode round-trip test.
 */
@DataProvider(name = "ListOfStrings")
public Object[][] listOfStringsProvider() {
    return new Object[][]{
            {Arrays.asList("s1", "s2"), ",s1,s2"},
            {Arrays.asList("s1", "s2", "s3"), ",s1,s2,s3"},
            {Arrays.asList("s1", "s2", "s3", "s4"), ",s1,s2,s3,s4"}
    };
}
/**
 * Checks that collapsing a list of strings yields the expected comma-prefixed
 * form, and that exploding that form gives back the original list.
 */
@Test(dataProvider = "ListOfStrings")
public void testEncodingListOfString(List<String> strings, String expected) throws IOException {
    // forward direction: list => collapsed string
    final String collapsed = BCF2Utils.collapseStringList(strings);
    Assert.assertEquals(collapsed, expected);

    // reverse direction: collapsed string => original list
    final List<String> roundTripped = BCF2Utils.exploreStringList(collapsed);
    Assert.assertEquals(roundTripped, strings);
}
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
public void testBCF2EncodingVectors(final List<BCF2TypedValue> toEncode) throws IOException {
for ( final BCF2TypedValue tv : toEncode ) {
for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) {
BCF2Encoder encoder = new BCF2Encoder();
List<Object> expected = Collections.nCopies(length, tv.value);
encoder.encodeTypedVector(expected, tv.type);
encoder.encodeTyped(expected, tv.type);
BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
final Object decoded = decoder.decodeTypedValue();
if ( tv.type == BCF2Type.CHAR ) {
Assert.assertTrue(decoded instanceof String);
final String decodedString = (String)decoded;
Assert.assertTrue(decodedString.length() == length);
} else {
Assert.assertTrue(decoded instanceof List);
final List<Object> decodedList = (List<Object>)decoded;
Assert.assertEquals(decodedList.size(), expected.size());
for ( Object decodedValue : decodedList )
myAssertEquals(tv, decodedValue);
}
Assert.assertTrue(decoded instanceof List);
final List<Object> decodedList = (List<Object>)decoded;
Assert.assertEquals(decodedList.size(), expected.size());
for ( Object decodedValue : decodedList )
myAssertEquals(tv, decodedValue);
}
}
}

View File

@ -172,10 +172,9 @@ public class VariantContextTestProvider {
add(builder().attribute("STRING1", "s1"));
add(builder().attribute("STRING1", null));
// TODO - renable when BCF2 spec is fixed
// add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3")));
// add(builder().attribute("STRING3", null));
// add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20")));
add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3")));
add(builder().attribute("STRING3", null));
add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20")));
metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x"));