From 93cef8263718ed064dbb91e312685165f9325981 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 12 May 2012 19:59:00 -0400 Subject: [PATCH] BCF2 header encoding decoding at final spec --- .../sting/utils/codecs/bcf2/BCF2Codec.java | 55 ++++++++----------- .../sting/utils/codecs/bcf2/BCF2Decoder.java | 34 ++---------- .../sting/utils/codecs/bcf2/BCF2Utils.java | 52 ++++++++++++++---- .../utils/codecs/bcf2/writer/BCF2Writer.java | 22 ++++++-- .../codecs/vcf/writer/StandardVCFWriter.java | 6 +- 5 files changed, 89 insertions(+), 80 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index b6d9e8d90..72fb45322 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import java.io.ByteArrayInputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; @@ -50,6 +51,7 @@ public class BCF2Codec implements FeatureCodec { private ArrayList dictionary; private final BCF2Decoder decoder = new BCF2Decoder(); private boolean skipGenotypes = false; + private final static int MAX_HEADER_SIZE = 0x08000000; // ---------------------------------------------------------------------- // @@ -91,36 +93,29 @@ public class BCF2Codec implements FeatureCodec { @Override public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) { - AsciiLineReader headerReader = new AsciiLineReader(inputStream); - String headerLine; - List headerLines = new ArrayList(); - boolean foundHeaderEnd = false; - try { - while ( ! 
foundHeaderEnd && (headerLine = headerReader.readLine()) != null) { - if ( headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) { - headerLines.add(headerLine); - } - else if ( headerLine.startsWith(VCFHeader.HEADER_INDICATOR) ) { - headerLines.add(headerLine); - foundHeaderEnd = true; - } - else { - throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line"); - } - } - } - catch ( IOException e ) { + // note that this reads the magic as well, and so does double duty + if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) ) + throw new IllegalArgumentException("Input stream does not begin with BCF2 magic"); + + final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream); + + if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 128 MB (0x08000000 bytes) + throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE); + + final byte[] headerBytes = new byte[headerSizeInBytes]; + if ( inputStream.read(headerBytes) != headerSizeInBytes ) + throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes); + + final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes)); + final AsciiLineReader headerReader = new AsciiLineReader(bps); + final VCFCodec headerParser = new VCFCodec(); + this.header = (VCFHeader)headerParser.readHeader(headerReader); + bps.close(); + } catch ( IOException e ) { throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 header"); } - if ( ! 
foundHeaderEnd ) { - throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line"); - } - - // read the header - this.header = AbstractVCFCodec.parseHeader(headerLines, VCFHeaderVersion.VCF4_1); - // create the config offsets for ( final VCFContigHeaderLine contig : header.getContigLines()) contigNames.add(contig.getID()); @@ -136,18 +131,12 @@ public class BCF2Codec implements FeatureCodec { public boolean canDecode( final String path ) { try { FileInputStream fis = new FileInputStream(path); - AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis)); - String firstLine = reader.readLine(); - if ( firstLine != null && firstLine.equals(BCF2Utils.VERSION_LINE) ) { - return true; - } + return BCF2Utils.startsWithBCF2Magic(fis); } catch ( FileNotFoundException e ) { return false; } catch ( IOException e ) { return false; } - - return false; } private final ArrayList parseDictionary(final VCFHeader header) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index 5d6ebf113..d7eebdc17 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -151,7 +151,7 @@ public class BCF2Decoder { public final Object decodeSingleValue(final BCF2Type type) { // TODO -- decodeTypedValue should integrate this routine - final int value = readInt(type.getSizeInBytes(), recordStream); + final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream); if ( value == type.getMissingBytes() ) return null; @@ -196,7 +196,7 @@ public class BCF2Decoder { } public final int decodeInt(int bytesForEachInt) { - return readInt(bytesForEachInt, recordStream); + return BCF2Utils.readInt(bytesForEachInt, recordStream); } public final float rawFloatToFloat(final int rawFloat) { @@ -216,7 +216,7 @@ 
public class BCF2Decoder { * @return */ public final int readBlockSize(final InputStream inputStream) { - return readInt(4, inputStream); + return BCF2Utils.readInt(4, inputStream); } /** @@ -246,32 +246,6 @@ public class BCF2Decoder { } public final byte readTypeDescriptor() { - return readByte(recordStream); - } - - private final static byte readByte(final InputStream stream) { - try { - return (byte)(stream.read() & 0xFF); - } catch ( IOException e ) { - throw new ReviewedStingException("readByte failure", e); - } - } - - private final static int readInt(int bytesForEachInt, final InputStream stream) { - switch ( bytesForEachInt ) { - case 1: { - return (byte)(readByte(stream)); - } case 2: { - final int b1 = readByte(stream) & 0xFF; - final int b2 = readByte(stream) & 0xFF; - return (short)((b1 << 8) | b2); - } case 4: { - final int b1 = readByte(stream) & 0xFF; - final int b2 = readByte(stream) & 0xFF; - final int b3 = readByte(stream) & 0xFF; - final int b4 = readByte(stream) & 0xFF; - return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4); - } default: throw new ReviewedStingException("Unexpected size during decoding"); - } + return BCF2Utils.readByte(recordStream); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index 3d6fe056d..76bb17529 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -28,8 +28,12 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.io.IOException; +import java.io.InputStream; import java.util.ArrayList; +import 
java.util.Arrays; /** * Common utilities for working with BCF2 files @@ -40,12 +44,10 @@ import java.util.ArrayList; * @since 5/12 */ public class BCF2Utils { + public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes(); + public static final int OVERFLOW_ELEMENT_MARKER = 15; public static final int MAX_INLINE_ELEMENTS = 14; - public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[3]; - private final static BCF2Type[] LOOKUP = BCF2Type.values(); - public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d"; - public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1); // Note that these values are prefixed by FFFFFF for convenience public static final int INT8_MISSING_VALUE = 0xFFFFFF80; @@ -53,11 +55,9 @@ public class BCF2Utils { public static final int INT32_MISSING_VALUE = 0x80000000; public static final int FLOAT_MISSING_VALUE = 0x7F800001; - static { - BCF2Utils.INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8; - BCF2Utils.INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16; - BCF2Utils.INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32; - } + public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32}; + + private BCF2Utils() {} /** * Create a strings dictionary from the VCF header @@ -98,7 +98,7 @@ public class BCF2Utils { } public final static BCF2Type decodeType(final byte typeDescriptor) { - return LOOKUP[decodeTypeID(typeDescriptor)]; + return BCF2Type.values()[decodeTypeID(typeDescriptor)]; } public final static boolean sizeIsOverflow(final byte typeDescriptor) { @@ -108,4 +108,36 @@ public class BCF2Utils { public final static boolean willOverflow(final long nElements) { return nElements > MAX_INLINE_ELEMENTS; } + + public final static boolean startsWithBCF2Magic(final InputStream stream) throws IOException { + final byte[] magicBytes = new byte[BCF2Utils.MAGIC_HEADER_LINE.length]; + stream.read(magicBytes); + return 
Arrays.equals(magicBytes, BCF2Utils.MAGIC_HEADER_LINE); + } + + public final static byte readByte(final InputStream stream) { + try { + return (byte)(stream.read() & 0xFF); + } catch ( IOException e ) { + throw new ReviewedStingException("readByte failure", e); + } + } + + public final static int readInt(int bytesForEachInt, final InputStream stream) { + switch ( bytesForEachInt ) { + case 1: { + return (byte)(readByte(stream)); + } case 2: { + final int b1 = readByte(stream) & 0xFF; + final int b2 = readByte(stream) & 0xFF; + return (short)((b1 << 8) | b2); + } case 4: { + final int b1 = readByte(stream) & 0xFF; + final int b2 = readByte(stream) & 0xFF; + final int b3 = readByte(stream) & 0xFF; + final int b4 = readByte(stream) & 0xFF; + return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4); + } default: throw new ReviewedStingException("Unexpected size during decoding"); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/writer/BCF2Writer.java index 822f62407..d137413c0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/writer/BCF2Writer.java @@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; -import java.io.OutputStreamWriter; +import java.io.*; import java.util.*; public class BCF2Writer extends IndexingVCFWriter { @@ -80,8 +77,21 @@ public class BCF2Writer extends IndexingVCFWriter { stringDictionaryMap.put(dict.get(i), i); } - // write out the header - StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Utils.VERSION_LINE, "BCF2 stream"); + try { + 
// write out the header into a byte stream, get its length, and write everything to the file + final ByteArrayOutputStream capture = new ByteArrayOutputStream(); + final OutputStreamWriter writer = new OutputStreamWriter(capture); + StandardVCFWriter.writeHeader(header, writer, doNotWriteGenotypes, StandardVCFWriter.getVersionLine(), "BCF2 stream"); + writer.append('\0'); // the header is terminated by a single null byte + writer.close(); + + final byte[] headerBytes = capture.toByteArray(); + outputStream.write(BCF2Utils.MAGIC_HEADER_LINE); + BCF2Encoder.encodePrimitive(headerBytes.length, BCF2Type.INT32, outputStream); + outputStream.write(headerBytes); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile("BCF2 stream", "Got IOException while trying to write BCF2 header", e); + } } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/writer/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/writer/StandardVCFWriter.java index dbb8bcd21..5acd03090 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/writer/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/writer/StandardVCFWriter.java @@ -91,7 +91,7 @@ public class StandardVCFWriter extends IndexingVCFWriter { @Override public void writeHeader(VCFHeader header) { mHeader = header; - writeHeader(mHeader, mWriter, doNotWriteGenotypes, VERSION_LINE, getStreamName()); + writeHeader(mHeader, mWriter, doNotWriteGenotypes, getVersionLine(), getStreamName()); // determine if we use filters, so we should FORCE pass the records // TODO -- this might not be necessary any longer as we have unfiltered, filtered, and PASS VCs @@ -101,6 +101,10 @@ public class StandardVCFWriter extends IndexingVCFWriter { } } + public static final String getVersionLine() { + return VERSION_LINE; + } + public static void writeHeader(VCFHeader header, final Writer writer, final boolean doNotWriteGenotypes,