BCF2 header encoding and decoding updated to the final spec

This commit is contained in:
Mark DePristo 2012-05-12 19:59:00 -04:00
parent ce9e9eebb1
commit 93cef82637
5 changed files with 89 additions and 80 deletions

View File

@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
@ -50,6 +51,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
private ArrayList<String> dictionary; private ArrayList<String> dictionary;
private final BCF2Decoder decoder = new BCF2Decoder(); private final BCF2Decoder decoder = new BCF2Decoder();
private boolean skipGenotypes = false; private boolean skipGenotypes = false;
private final static int MAX_HEADER_SIZE = 0x08000000;
// ---------------------------------------------------------------------- // ----------------------------------------------------------------------
// //
@ -91,36 +93,29 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
@Override @Override
public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) { public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) {
AsciiLineReader headerReader = new AsciiLineReader(inputStream);
String headerLine;
List<String> headerLines = new ArrayList<String>();
boolean foundHeaderEnd = false;
try { try {
while ( ! foundHeaderEnd && (headerLine = headerReader.readLine()) != null) { // note that this reads the magic as well, and so does double duty
if ( headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) { if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
headerLines.add(headerLine); throw new IllegalArgumentException("Input stream does not begin with BCF2 magic");
}
else if ( headerLine.startsWith(VCFHeader.HEADER_INDICATOR) ) { final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);
headerLines.add(headerLine);
foundHeaderEnd = true; if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 128 MB (MAX_HEADER_SIZE = 0x08000000 bytes)
} throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
else {
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line"); final byte[] headerBytes = new byte[headerSizeInBytes];
} if ( inputStream.read(headerBytes) != headerSizeInBytes )
} throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
}
catch ( IOException e ) { final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes));
final AsciiLineReader headerReader = new AsciiLineReader(bps);
final VCFCodec headerParser = new VCFCodec();
this.header = (VCFHeader)headerParser.readHeader(headerReader);
bps.close();
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 header"); throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 header");
} }
if ( ! foundHeaderEnd ) {
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line");
}
// read the header
this.header = AbstractVCFCodec.parseHeader(headerLines, VCFHeaderVersion.VCF4_1);
// create the config offsets // create the config offsets
for ( final VCFContigHeaderLine contig : header.getContigLines()) for ( final VCFContigHeaderLine contig : header.getContigLines())
contigNames.add(contig.getID()); contigNames.add(contig.getID());
@ -136,18 +131,12 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
public boolean canDecode( final String path ) { public boolean canDecode( final String path ) {
try { try {
FileInputStream fis = new FileInputStream(path); FileInputStream fis = new FileInputStream(path);
AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis)); return BCF2Utils.startsWithBCF2Magic(fis);
String firstLine = reader.readLine();
if ( firstLine != null && firstLine.equals(BCF2Utils.VERSION_LINE) ) {
return true;
}
} catch ( FileNotFoundException e ) { } catch ( FileNotFoundException e ) {
return false; return false;
} catch ( IOException e ) { } catch ( IOException e ) {
return false; return false;
} }
return false;
} }
private final ArrayList<String> parseDictionary(final VCFHeader header) { private final ArrayList<String> parseDictionary(final VCFHeader header) {

View File

@ -151,7 +151,7 @@ public class BCF2Decoder {
public final Object decodeSingleValue(final BCF2Type type) { public final Object decodeSingleValue(final BCF2Type type) {
// TODO -- decodeTypedValue should integrate this routine // TODO -- decodeTypedValue should integrate this routine
final int value = readInt(type.getSizeInBytes(), recordStream); final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
if ( value == type.getMissingBytes() ) if ( value == type.getMissingBytes() )
return null; return null;
@ -196,7 +196,7 @@ public class BCF2Decoder {
} }
public final int decodeInt(int bytesForEachInt) { public final int decodeInt(int bytesForEachInt) {
return readInt(bytesForEachInt, recordStream); return BCF2Utils.readInt(bytesForEachInt, recordStream);
} }
public final float rawFloatToFloat(final int rawFloat) { public final float rawFloatToFloat(final int rawFloat) {
@ -216,7 +216,7 @@ public class BCF2Decoder {
* @return * @return
*/ */
public final int readBlockSize(final InputStream inputStream) { public final int readBlockSize(final InputStream inputStream) {
return readInt(4, inputStream); return BCF2Utils.readInt(4, inputStream);
} }
/** /**
@ -246,32 +246,6 @@ public class BCF2Decoder {
} }
public final byte readTypeDescriptor() { public final byte readTypeDescriptor() {
return readByte(recordStream); return BCF2Utils.readByte(recordStream);
}
/**
 * Reads a single byte from the stream.
 *
 * NOTE(review): InputStream.read() returns -1 at end of stream; narrowing that
 * to a byte yields (byte)0xFF, so end-of-stream cannot be told apart from a
 * real 0xFF data byte here -- confirm callers detect EOF some other way.
 *
 * @param stream the stream to read from
 * @return the low-order 8 bits of the value returned by stream.read()
 * @throws ReviewedStingException wrapping any IOException raised by the stream
 */
private final static byte readByte(final InputStream stream) {
    final int raw;
    try {
        raw = stream.read();
    } catch ( IOException e ) {
        throw new ReviewedStingException("readByte failure", e);
    }
    // the narrowing cast keeps only the low 8 bits, so no explicit mask is needed
    return (byte) raw;
}
private final static int readInt(int bytesForEachInt, final InputStream stream) {
switch ( bytesForEachInt ) {
case 1: {
return (byte)(readByte(stream));
} case 2: {
final int b1 = readByte(stream) & 0xFF;
final int b2 = readByte(stream) & 0xFF;
return (short)((b1 << 8) | b2);
} case 4: {
final int b1 = readByte(stream) & 0xFF;
final int b2 = readByte(stream) & 0xFF;
final int b3 = readByte(stream) & 0xFF;
final int b4 = readByte(stream) & 0xFF;
return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4);
} default: throw new ReviewedStingException("Unexpected size during decoding");
}
} }
} }

View File

@ -28,8 +28,12 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
/** /**
* Common utilities for working with BCF2 files * Common utilities for working with BCF2 files
@ -40,12 +44,10 @@ import java.util.ArrayList;
* @since 5/12 * @since 5/12
*/ */
public class BCF2Utils { public class BCF2Utils {
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
public static final int OVERFLOW_ELEMENT_MARKER = 15; public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14; public static final int MAX_INLINE_ELEMENTS = 14;
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[3];
private final static BCF2Type[] LOOKUP = BCF2Type.values();
public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d";
public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1);
// Note that these values are prefixed by FFFFFF for convenience // Note that these values are prefixed by FFFFFF for convenience
public static final int INT8_MISSING_VALUE = 0xFFFFFF80; public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
@ -53,11 +55,9 @@ public class BCF2Utils {
public static final int INT32_MISSING_VALUE = 0x80000000; public static final int INT32_MISSING_VALUE = 0x80000000;
public static final int FLOAT_MISSING_VALUE = 0x7F800001; public static final int FLOAT_MISSING_VALUE = 0x7F800001;
static { public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
BCF2Utils.INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8;
BCF2Utils.INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16; private BCF2Utils() {}
BCF2Utils.INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32;
}
/** /**
* Create a strings dictionary from the VCF header * Create a strings dictionary from the VCF header
@ -98,7 +98,7 @@ public class BCF2Utils {
} }
public final static BCF2Type decodeType(final byte typeDescriptor) { public final static BCF2Type decodeType(final byte typeDescriptor) {
return LOOKUP[decodeTypeID(typeDescriptor)]; return BCF2Type.values()[decodeTypeID(typeDescriptor)];
} }
public final static boolean sizeIsOverflow(final byte typeDescriptor) { public final static boolean sizeIsOverflow(final byte typeDescriptor) {
@ -108,4 +108,36 @@ public class BCF2Utils {
public final static boolean willOverflow(final long nElements) { public final static boolean willOverflow(final long nElements) {
return nElements > MAX_INLINE_ELEMENTS; return nElements > MAX_INLINE_ELEMENTS;
} }
/**
 * Checks whether the stream begins with the BCF2 magic bytes.
 *
 * Consumes up to MAGIC_HEADER_LINE.length bytes from the stream. A single
 * InputStream.read(byte[]) call may legally return fewer bytes than requested
 * even when more are available later, so the read is repeated until the buffer
 * is full or the stream ends.
 *
 * @param stream the stream positioned at the start of the candidate BCF2 data
 * @return true if the first bytes equal MAGIC_HEADER_LINE; false if they differ
 *         or the stream ends before enough bytes could be read
 * @throws IOException if the underlying stream fails while reading
 */
public final static boolean startsWithBCF2Magic(final InputStream stream) throws IOException {
    final byte[] magicBytes = new byte[BCF2Utils.MAGIC_HEADER_LINE.length];

    int filled = 0;
    while ( filled < magicBytes.length ) {
        final int justRead = stream.read(magicBytes, filled, magicBytes.length - filled);
        if ( justRead < 0 ) {
            // stream ended before the full magic could be read
            return false;
        }
        filled += justRead;
    }

    return Arrays.equals(magicBytes, BCF2Utils.MAGIC_HEADER_LINE);
}
/**
 * Reads a single byte from the stream.
 *
 * NOTE(review): InputStream.read() returns -1 at end of stream; narrowing that
 * to a byte yields (byte)0xFF, so end-of-stream cannot be told apart from a
 * real 0xFF data byte here -- confirm callers detect EOF some other way.
 *
 * @param stream the stream to read from
 * @return the low-order 8 bits of the value returned by stream.read()
 * @throws ReviewedStingException wrapping any IOException raised by the stream
 */
public final static byte readByte(final InputStream stream) {
    final int raw;
    try {
        raw = stream.read();
    } catch ( IOException e ) {
        throw new ReviewedStingException("readByte failure", e);
    }
    // the narrowing cast keeps only the low 8 bits, so no explicit mask is needed
    return (byte) raw;
}
/**
 * Reads a signed integer of 1, 2 or 4 bytes from the stream, most significant
 * byte first, and returns it sign-extended to an int.
 *
 * @param bytesForEachInt width of the encoded value in bytes; must be 1, 2 or 4
 * @param stream the stream to read the bytes from
 * @return the decoded value (a 1-byte value is sign-extended from byte, a
 *         2-byte value from short)
 * @throws ReviewedStingException if bytesForEachInt is not 1, 2 or 4 (nothing
 *         is read from the stream in that case), or if reading a byte fails
 */
public final static int readInt(int bytesForEachInt, final InputStream stream) {
    final boolean supportedWidth = bytesForEachInt == 1 || bytesForEachInt == 2 || bytesForEachInt == 4;
    if ( ! supportedWidth ) {
        throw new ReviewedStingException("Unexpected size during decoding");
    }

    // fold the bytes together in stream order: the first byte read ends up most significant
    int accumulated = 0;
    for ( int i = 0; i < bytesForEachInt; i++ ) {
        accumulated = (accumulated << 8) | (readByte(stream) & 0xFF);
    }

    // narrow to the encoded width so the sign bit of that width is extended
    if ( bytesForEachInt == 1 ) {
        return (byte) accumulated;
    }
    if ( bytesForEachInt == 2 ) {
        return (short) accumulated;
    }
    return accumulated;
}
} }

View File

@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File; import java.io.*;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.*; import java.util.*;
public class BCF2Writer extends IndexingVCFWriter { public class BCF2Writer extends IndexingVCFWriter {
@ -80,8 +77,21 @@ public class BCF2Writer extends IndexingVCFWriter {
stringDictionaryMap.put(dict.get(i), i); stringDictionaryMap.put(dict.get(i), i);
} }
// write out the header try {
StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Utils.VERSION_LINE, "BCF2 stream"); // write out the header into a byte stream, get its length, and write everything to the file
final ByteArrayOutputStream capture = new ByteArrayOutputStream();
final OutputStreamWriter writer = new OutputStreamWriter(capture);
StandardVCFWriter.writeHeader(header, writer, doNotWriteGenotypes, StandardVCFWriter.getVersionLine(), "BCF2 stream");
writer.append('\0'); // the header is null terminated by a byte
writer.close();
final byte[] headerBytes = capture.toByteArray();
outputStream.write(BCF2Utils.MAGIC_HEADER_LINE);
BCF2Encoder.encodePrimitive(headerBytes.length, BCF2Type.INT32, outputStream);
outputStream.write(headerBytes);
} catch (IOException e) {
throw new UserException.CouldNotCreateOutputFile("BCF2 stream", "Got IOException while trying to write BCF2 header", e);
}
} }
@Override @Override

View File

@ -91,7 +91,7 @@ public class StandardVCFWriter extends IndexingVCFWriter {
@Override @Override
public void writeHeader(VCFHeader header) { public void writeHeader(VCFHeader header) {
mHeader = header; mHeader = header;
writeHeader(mHeader, mWriter, doNotWriteGenotypes, VERSION_LINE, getStreamName()); writeHeader(mHeader, mWriter, doNotWriteGenotypes, getVersionLine(), getStreamName());
// determine if we use filters, so we should FORCE pass the records // determine if we use filters, so we should FORCE pass the records
// TODO -- this might not be necessary any longer as we have unfiltered, filtered, and PASS VCs // TODO -- this might not be necessary any longer as we have unfiltered, filtered, and PASS VCs
@ -101,6 +101,10 @@ public class StandardVCFWriter extends IndexingVCFWriter {
} }
} }
/**
 * Exposes the version line this writer places at the top of the header, so
 * other writers can pass the same value to the static writeHeader method.
 *
 * @return the value of VERSION_LINE
 */
public static final String getVersionLine() {
    final String versionLine = VERSION_LINE;
    return versionLine;
}
public static void writeHeader(VCFHeader header, public static void writeHeader(VCFHeader header,
final Writer writer, final Writer writer,
final boolean doNotWriteGenotypes, final boolean doNotWriteGenotypes,