diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index 1cf5370b0..7598ba1ec 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -173,12 +173,14 @@ public class BCF2Decoder { // // ---------------------------------------------------------------------- - private final String decodeLiteralString(final int size) { + private final Object decodeLiteralString(final int size) { + assert size > 0; // TODO -- assumes size > 0 final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array try { recordStream.read(bytes); - return new String(bytes); + final String s = new String(bytes); + return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s; } catch ( IOException e ) { throw new ReviewedStingException("readByte failure", e); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java index ec76f858c..c504f0de7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java @@ -72,15 +72,13 @@ public class BCF2Encoder { public final BCF2Type encode(final Object o) throws IOException { if ( o == null ) throw new ReviewedStingException("Generic encode cannot deal with null values"); - if ( o instanceof String ) { - return encodeString((String)o); - } else if ( o instanceof List ) { - final BCF2Type type = determinePrimitiveType(((List) o).get(0)); - encodeTypedVector((List) o, type); + if ( o instanceof List ) { + final BCF2Type type = determineBCFType(((List) o).get(0)); + encodeTyped((List) o, type); return type; } else { - final BCF2Type type = determinePrimitiveType(o); - encodeTypedSingleton(o, type); + final BCF2Type type = determineBCFType(o); + encodeTyped(o, type); return type; } } @@ -92,31 +90,27 @@ public class BCF2Encoder { // -------------------------------------------------------------------------------- public final void encodeTypedMissing(final BCF2Type type) throws IOException { - encodeTypedVector(Collections.emptyList(), type); + encodeTyped(Collections.emptyList(), type); } // todo -- should be specialized for each object type for efficiency - public final void encodeTypedSingleton(final Object v, final BCF2Type type) throws IOException { - encodeTypedVector(Collections.singleton(v), type); + public final void encodeTyped(final Object v, final BCF2Type type) throws IOException { + encodeTyped(Collections.singletonList(v), type); } - public final BCF2Type encodeString(final String v) throws IOException { - // TODO -- this needs to be optimized - final byte[] bytes = v.getBytes(); - final List l = new ArrayList(bytes.length); - for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]); - encodeTypedVector(l, BCF2Type.CHAR); - return BCF2Type.CHAR; - } + public final void encodeTyped(List v, final BCF2Type type) throws IOException { + if ( type == BCF2Type.CHAR && v.size() != 0 ) { + final String s = v.size() > 1 ? BCF2Utils.collapseStringList((List)v) : (String)v.get(0); + v = stringToBytes(s); + } - public final void encodeTypedVector(final Collection v, final BCF2Type type) throws IOException { encodeType(v.size(), type); encodeRawValues(v, type); } public final BCF2Type encodeTypedIntOfBestSize(final int value) throws IOException { final BCF2Type type = determineIntegerType(value); - encodeTypedSingleton(value, type); + encodeTyped(value, type); return type; } @@ -214,13 +208,17 @@ public class BCF2Encoder { throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value); } - private final BCF2Type determinePrimitiveType(final Object v) { - if ( v instanceof Integer ) - return determineIntegerType((Integer)v); - else if ( v instanceof Double ) + private final BCF2Type determineBCFType(final Object arg) { + final Object toType = arg instanceof List ? ((List)arg).get(0) : arg; + + if ( toType instanceof Integer ) + return determineIntegerType((Integer)toType); + else if ( toType instanceof String ) + return BCF2Type.CHAR; + else if ( toType instanceof Double ) return BCF2Type.FLOAT; else - throw new ReviewedStingException("No native encoding for Object of type " + v.getClass().getSimpleName()); + throw new ReviewedStingException("No native encoding for Object of type " + arg.getClass().getSimpleName()); } public final static void encodePrimitive(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException { @@ -231,4 +229,12 @@ public class BCF2Encoder { encodeStream.write(byteValue); } } + + private final List stringToBytes(final String v) throws IOException { + // TODO -- this needs to be optimized away for efficiency + final byte[] bytes = v.getBytes(); + final List l = new ArrayList(bytes.length); + for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]); + return l; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java index 37bdda6a4..cc7debc00 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java @@ -31,29 +31,28 @@ package org.broadinstitute.sting.utils.codecs.bcf2; * @since 05/12 */ public enum BCF2Type { - RESERVED_0, - INT8(1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range - INT16(2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767), - INT32(4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647), - RESERVED_4, - FLOAT(4, BCF2Utils.FLOAT_MISSING_VALUE), - RESERVED_6, - CHAR; + INT8(1, 1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range + INT16(2, 2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767), + INT32(3, 4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647), + FLOAT(5, 4, BCF2Utils.FLOAT_MISSING_VALUE), + CHAR(7); + private final int id; private final Object missingJavaValue; private final int missingBytes; private final int sizeInBytes; private final long minValue, maxValue; - BCF2Type() { - this(-1, 0, 0, 0); + BCF2Type(final int id) { + this(id, -1, 0, 0, 0); } - BCF2Type(final int sizeInBytes, final int missingBytes) { - this(sizeInBytes, missingBytes, 0, 0); + BCF2Type(final int id, final int sizeInBytes, final int missingBytes) { + this(id, sizeInBytes, missingBytes, 0, 0); } - BCF2Type(final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) { + BCF2Type(final int id, final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) { + this.id = id; this.sizeInBytes = sizeInBytes; this.missingJavaValue = null; this.missingBytes = missingBytes; @@ -64,7 +63,7 @@ public enum BCF2Type { public int getSizeInBytes() { return sizeInBytes; } - public int getID() { return ordinal(); } + public int getID() { return id; } public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } public Object getMissingJavaValue() { return missingJavaValue; } public int getMissingBytes() { return missingBytes; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index 76bb17529..aeec7260b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -34,6 +34,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; +import java.util.List; /** * Common utilities for working with BCF2 files @@ -56,6 +57,14 @@ public class BCF2Utils { public static final int FLOAT_MISSING_VALUE = 0x7F800001; public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32}; + public final static BCF2Type[] ID_TO_ENUM; + + static { + int maxID = -1; + for ( BCF2Type v : BCF2Type.values() ) maxID = Math.max(v.getID(), maxID); + ID_TO_ENUM = new BCF2Type[maxID+1]; + for ( BCF2Type v : BCF2Type.values() ) ID_TO_ENUM[v.getID()] = v; + } private BCF2Utils() {} @@ -98,7 +107,7 @@ public class BCF2Utils { } public final static BCF2Type decodeType(final byte typeDescriptor) { - return BCF2Type.values()[decodeTypeID(typeDescriptor)]; + return ID_TO_ENUM[decodeTypeID(typeDescriptor)]; } public final static boolean sizeIsOverflow(final byte typeDescriptor) { @@ -140,4 +149,42 @@ public class BCF2Utils { } default: throw new ReviewedStingException("Unexpected size during decoding"); } } + + /** + * Collapse multiple strings into a comma separated list + * + * ["s1", "s2", "s3"] => ",s1,s2,s3" + * + * @param strings size > 1 list of strings + * @return + */ + public static final String collapseStringList(final List strings) { + assert strings.size() > 1; + + StringBuilder b = new StringBuilder(); + for ( final String s : strings ) { + assert s.indexOf(",") == -1; // no commas in individual strings + b.append(",").append(s); + } + return b.toString(); + } + + /** + * Inverse operation of collapseStringList. + * + * ",s1,s2,s3" => ["s1", "s2", "s3"] + * + * + * @param collapsed + * @return + */ + public static final List exploreStringList(final String collapsed) { + assert isCollapsedString(collapsed); + final String[] exploded = collapsed.substring(1).split(","); + return Arrays.asList(exploded); + } + + public static final boolean isCollapsedString(final String s) { + return s.charAt(0) == ','; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index b599773b3..f41c3243d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -173,13 +173,13 @@ class BCF2Writer extends IndexingVariantContextWriter { } private void buildID( VariantContext vc ) throws IOException { - encoder.encodeString(vc.getID()); + encoder.encodeTyped(vc.getID(), BCF2Type.CHAR); } private void buildAlleles( VariantContext vc ) throws IOException { for ( final Allele allele : vc.getAlleles() ) { final String s = vc.getAlleleWithRefPadding(allele); - encoder.encodeString(s); + encoder.encodeTyped(s, BCF2Type.CHAR); } } @@ -194,19 +194,10 @@ class BCF2Writer extends IndexingVariantContextWriter { private void buildInfo( VariantContext vc ) throws IOException { for ( Map.Entry infoFieldEntry : vc.getAttributes().entrySet() ) { final String key = infoFieldEntry.getKey(); - Object value = infoFieldEntry.getValue(); - - final VCFToBCFType typeEquiv = getBCF2TypeFromHeader(key, value); - // handle the special FLAG case -- super annoying - if ( typeEquiv.vcfType == VCFHeaderLineType.Flag ) value = 1; + final VCFToBCFEncoding encoding = prepFieldValueForEncoding(key, infoFieldEntry.getValue()); encodeStringByRef(key); - if ( value instanceof List ) // NOTE: ONLY WORKS WITH LISTS - encoder.encodeTypedVector((List) value, typeEquiv.BCF2Type); - else if ( value instanceof String ) - encoder.encodeString((String)value); - else - encoder.encodeTypedSingleton(value, typeEquiv.BCF2Type); + encoder.encodeTyped(encoding.valuesToEncode, encoding.BCF2Type); } } @@ -265,51 +256,67 @@ class BCF2Writer extends IndexingVariantContextWriter { private final void addGenericGenotypeField(final VariantContext vc, final String field) throws IOException { final int numInFormatField = getNGenotypeFieldValues(field, vc); - final VCFToBCFType type = getBCF2TypeFromHeader(field, null); + final VCFToBCFEncoding encoding = prepFieldValueForEncoding(field, null); - startGenotypeField(field, numInFormatField, type.BCF2Type); + startGenotypeField(field, numInFormatField, encoding.BCF2Type); for ( final Genotype g : vc.getGenotypes() ) { if ( ! g.hasAttribute(field) ) { - encoder.encodeRawMissingValues(numInFormatField, type.BCF2Type); + encoder.encodeRawMissingValues(numInFormatField, encoding.BCF2Type); } else { final Object val = g.getAttribute(field); final Collection vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val; - encoder.encodeRawValues(vals, type.BCF2Type); + encoder.encodeRawValues(vals, encoding.BCF2Type); } } } - private final class VCFToBCFType { + private final class VCFToBCFEncoding { VCFHeaderLineType vcfType; BCF2Type BCF2Type; + List valuesToEncode; - private VCFToBCFType(final VCFHeaderLineType vcfType, final BCF2Type BCF2Type) { + private VCFToBCFEncoding(final VCFHeaderLineType vcfType, final BCF2Type BCF2Type, final List valuesToEncode) { this.vcfType = vcfType; this.BCF2Type = BCF2Type; + this.valuesToEncode = (List)valuesToEncode; } } // TODO -- we really need explicit converters as first class objects - private final VCFToBCFType getBCF2TypeFromHeader(final String field, final Object maybeIntValue) { - // TODO -- need to generalize so we can enable vectors of compressed genotype ints + // TODO -- need to generalize so we can enable vectors of compressed genotype ints + // TODO -- no sense in allocating these over and over + private final VCFToBCFEncoding prepFieldValueForEncoding(final String field, final Object value) { final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field); + final boolean isList = value instanceof List; + final Object toType = isList ? ((List)value).get(0) : value; - // TODO -- no sense in allocating these over and over switch ( metaData.getType() ) { - case Character: return new VCFToBCFType(metaData.getType(), BCF2Type.CHAR); - case Flag: return new VCFToBCFType(metaData.getType(), BCF2Type.INT8); - case String: return new VCFToBCFType(metaData.getType(), BCF2Type.CHAR); + case Character: + assert toType instanceof String; + return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(value)); + case Flag: + return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1)); + case String: + final List s = isList ? (List)value : Collections.singletonList((String)value); + return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s); case Integer: // note integer calculation is a bit complex because of the need to determine sizes - BCF2Type type; - if ( maybeIntValue == null ) - type = BCF2Type.INT8; - else if ( maybeIntValue instanceof List ) - type = encoder.determineIntegerType(((List)maybeIntValue)); - else - type = encoder.determineIntegerType((Integer)maybeIntValue); - return new VCFToBCFType(metaData.getType(), type); - case Float: return new VCFToBCFType(metaData.getType(), BCF2Type.FLOAT); - default: throw new ReviewedStingException("Unexpected type for field " + field); + List l; + BCF2Type intType; + if ( isList ) { + l = (List)value; + intType = encoder.determineIntegerType(l); + } else if ( value != null ) { + intType = encoder.determineIntegerType((Integer)value); + l = Collections.singletonList((Integer)value); + } else { + intType = BCF2Type.INT8; + l = Collections.singletonList((Integer) null); + } + return new VCFToBCFEncoding(metaData.getType(), intType, l); + case Float: + return new VCFToBCFEncoding(metaData.getType(), BCF2Type.FLOAT, isList ? (List)value : Collections.singletonList(value)); + default: + throw new ReviewedStingException("Unexpected type for field " + field); } } @@ -395,7 +402,7 @@ class BCF2Writer extends IndexingVariantContextWriter { } // we've checked the types for all strings, so write them out - encoder.encodeTypedVector(offsets, maxType); + encoder.encodeTyped(offsets, maxType); return maxType; } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java similarity index 92% rename from public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java index ac28ba6ee..784b13101 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java @@ -30,6 +30,7 @@ package org.broadinstitute.sting.utils.codecs.bcf2; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.StingException; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; @@ -42,7 +43,7 @@ import java.io.InputStream; import java.util.*; -public class EncoderDecoderUnitTest extends BaseTest { +public class BCF2EncoderDecoderUnitTest extends BaseTest { private final double FLOAT_TOLERANCE = 1e-6; final List primitives = new ArrayList(); final List basicTypes = new ArrayList(); @@ -54,7 +55,7 @@ public class EncoderDecoderUnitTest extends BaseTest { basicTypes.add(new BCF2TypedValue(1000, BCF2Type.INT16)); basicTypes.add(new BCF2TypedValue(1000000, BCF2Type.INT32)); basicTypes.add(new BCF2TypedValue(1.2345e6, BCF2Type.FLOAT)); - basicTypes.add(new BCF2TypedValue(new Byte((byte)'A'), BCF2Type.CHAR)); + basicTypes.add(new BCF2TypedValue("A", BCF2Type.CHAR)); // small ints primitives.add(new BCF2TypedValue(0, BCF2Type.INT8)); @@ -137,7 +138,6 @@ public class EncoderDecoderUnitTest extends BaseTest { for ( BCF2Type type : BCF2Type.values() ) { forCombinations.add(new BCF2TypedValue(null, type)); } - } // -------------------------------------------------------------------------------- @@ -203,28 +203,38 @@ public class EncoderDecoderUnitTest extends BaseTest { decodeRecord(toEncode, record); } + @DataProvider(name = "ListOfStrings") + public Object[][] listOfStringsProvider() { + List tests = new ArrayList(); + tests.add(new Object[]{Arrays.asList("s1", "s2"), ",s1,s2"}); + tests.add(new Object[]{Arrays.asList("s1", "s2", "s3"), ",s1,s2,s3"}); + tests.add(new Object[]{Arrays.asList("s1", "s2", "s3", "s4"), ",s1,s2,s3,s4"}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ListOfStrings") + public void testEncodingListOfString(List strings, String expected) throws IOException { + final String collapsed = BCF2Utils.collapseStringList(strings); + Assert.assertEquals(collapsed, expected); + Assert.assertEquals(BCF2Utils.exploreStringList(collapsed), strings); + } + @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") public void testBCF2EncodingVectors(final List toEncode) throws IOException { for ( final BCF2TypedValue tv : toEncode ) { for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { BCF2Encoder encoder = new BCF2Encoder(); List expected = Collections.nCopies(length, tv.value); - encoder.encodeTypedVector(expected, tv.type); + encoder.encodeTyped(expected, tv.type); BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); final Object decoded = decoder.decodeTypedValue(); - if ( tv.type == BCF2Type.CHAR ) { - Assert.assertTrue(decoded instanceof String); - final String decodedString = (String)decoded; - Assert.assertTrue(decodedString.length() == length); - } else { - Assert.assertTrue(decoded instanceof List); - final List decodedList = (List)decoded; - Assert.assertEquals(decodedList.size(), expected.size()); - for ( Object decodedValue : decodedList ) - myAssertEquals(tv, decodedValue); - } + Assert.assertTrue(decoded instanceof List); + final List decodedList = (List)decoded; + Assert.assertEquals(decodedList.size(), expected.size()); + for ( Object decodedValue : decodedList ) + myAssertEquals(tv, decodedValue); } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 9499160a2..990dfeec6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -172,10 +172,9 @@ public class VariantContextTestProvider { add(builder().attribute("STRING1", "s1")); add(builder().attribute("STRING1", null)); - // TODO - renable when BCF2 spec is fixed -// add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3"))); -// add(builder().attribute("STRING3", null)); -// add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20"))); + add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3"))); + add(builder().attribute("STRING3", null)); + add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20"))); metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x")); metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x")); metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x"));