BCF2 header encoding decoding at final spec

This commit is contained in:
Mark DePristo 2012-05-12 19:59:00 -04:00
parent ce9e9eebb1
commit 93cef82637
5 changed files with 89 additions and 80 deletions

View File

@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
@ -50,6 +51,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
private ArrayList<String> dictionary;
private final BCF2Decoder decoder = new BCF2Decoder();
private boolean skipGenotypes = false;
private final static int MAX_HEADER_SIZE = 0x08000000;
// ----------------------------------------------------------------------
//
@ -91,36 +93,29 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
@Override
public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) {
AsciiLineReader headerReader = new AsciiLineReader(inputStream);
String headerLine;
List<String> headerLines = new ArrayList<String>();
boolean foundHeaderEnd = false;
try {
while ( ! foundHeaderEnd && (headerLine = headerReader.readLine()) != null) {
if ( headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) {
headerLines.add(headerLine);
}
else if ( headerLine.startsWith(VCFHeader.HEADER_INDICATOR) ) {
headerLines.add(headerLine);
foundHeaderEnd = true;
}
else {
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line");
}
}
}
catch ( IOException e ) {
// note that this reads the magic as well, and so does double duty
if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
throw new IllegalArgumentException("Input stream does not begin with BCF2 magic");
final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);
if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB
throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
final byte[] headerBytes = new byte[headerSizeInBytes];
if ( inputStream.read(headerBytes) != headerSizeInBytes )
throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes));
final AsciiLineReader headerReader = new AsciiLineReader(bps);
final VCFCodec headerParser = new VCFCodec();
this.header = (VCFHeader)headerParser.readHeader(headerReader);
bps.close();
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 header");
}
if ( ! foundHeaderEnd ) {
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line");
}
// read the header
this.header = AbstractVCFCodec.parseHeader(headerLines, VCFHeaderVersion.VCF4_1);
// create the contig offsets
for ( final VCFContigHeaderLine contig : header.getContigLines())
contigNames.add(contig.getID());
@ -136,18 +131,12 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
public boolean canDecode( final String path ) {
try {
FileInputStream fis = new FileInputStream(path);
AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis));
String firstLine = reader.readLine();
if ( firstLine != null && firstLine.equals(BCF2Utils.VERSION_LINE) ) {
return true;
}
return BCF2Utils.startsWithBCF2Magic(fis);
} catch ( FileNotFoundException e ) {
return false;
} catch ( IOException e ) {
return false;
}
return false;
}
private final ArrayList<String> parseDictionary(final VCFHeader header) {

View File

@ -151,7 +151,7 @@ public class BCF2Decoder {
public final Object decodeSingleValue(final BCF2Type type) {
// TODO -- decodeTypedValue should integrate this routine
final int value = readInt(type.getSizeInBytes(), recordStream);
final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
if ( value == type.getMissingBytes() )
return null;
@ -196,7 +196,7 @@ public class BCF2Decoder {
}
public final int decodeInt(int bytesForEachInt) {
return readInt(bytesForEachInt, recordStream);
return BCF2Utils.readInt(bytesForEachInt, recordStream);
}
public final float rawFloatToFloat(final int rawFloat) {
@ -216,7 +216,7 @@ public class BCF2Decoder {
* @return
*/
public final int readBlockSize(final InputStream inputStream) {
return readInt(4, inputStream);
return BCF2Utils.readInt(4, inputStream);
}
/**
@ -246,32 +246,6 @@ public class BCF2Decoder {
}
public final byte readTypeDescriptor() {
return readByte(recordStream);
}
private final static byte readByte(final InputStream stream) {
try {
return (byte)(stream.read() & 0xFF);
} catch ( IOException e ) {
throw new ReviewedStingException("readByte failure", e);
}
}
private final static int readInt(int bytesForEachInt, final InputStream stream) {
switch ( bytesForEachInt ) {
case 1: {
return (byte)(readByte(stream));
} case 2: {
final int b1 = readByte(stream) & 0xFF;
final int b2 = readByte(stream) & 0xFF;
return (short)((b1 << 8) | b2);
} case 4: {
final int b1 = readByte(stream) & 0xFF;
final int b2 = readByte(stream) & 0xFF;
final int b3 = readByte(stream) & 0xFF;
final int b4 = readByte(stream) & 0xFF;
return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4);
} default: throw new ReviewedStingException("Unexpected size during decoding");
}
return BCF2Utils.readByte(recordStream);
}
}

View File

@ -28,8 +28,12 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Common utilities for working with BCF2 files
@ -40,12 +44,10 @@ import java.util.ArrayList;
* @since 5/12
*/
public class BCF2Utils {
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14;
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[3];
private final static BCF2Type[] LOOKUP = BCF2Type.values();
public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d";
public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1);
// Note that these values are prefixed by FFFFFF for convenience
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
@ -53,11 +55,9 @@ public class BCF2Utils {
public static final int INT32_MISSING_VALUE = 0x80000000;
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
static {
BCF2Utils.INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8;
BCF2Utils.INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16;
BCF2Utils.INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32;
}
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
private BCF2Utils() {}
/**
* Create a strings dictionary from the VCF header
@ -98,7 +98,7 @@ public class BCF2Utils {
}
public final static BCF2Type decodeType(final byte typeDescriptor) {
return LOOKUP[decodeTypeID(typeDescriptor)];
return BCF2Type.values()[decodeTypeID(typeDescriptor)];
}
public final static boolean sizeIsOverflow(final byte typeDescriptor) {
@ -108,4 +108,36 @@ public class BCF2Utils {
/**
 * Tests whether an element count is larger than MAX_INLINE_ELEMENTS.
 *
 * NOTE(review): presumably a count above this limit cannot be stored inline in a
 * type descriptor and must be flagged with OVERFLOW_ELEMENT_MARKER -- confirm
 * against the encoder.
 *
 * @param nElements the number of elements to be encoded
 * @return true if nElements is strictly greater than MAX_INLINE_ELEMENTS
 */
public final static boolean willOverflow(final long nElements) {
return nElements > MAX_INLINE_ELEMENTS;
}
/**
 * Checks whether the next bytes of the stream are the BCF2 magic (MAGIC_HEADER_LINE).
 *
 * The bytes examined are consumed from the stream whether or not they match, so a
 * caller that gets true is positioned immediately after the magic.
 *
 * @param stream the stream to read from; not closed by this method
 * @return true if MAGIC_HEADER_LINE.length bytes could be read and they equal
 *         MAGIC_HEADER_LINE; false if they differ or the stream ends first
 * @throws IOException if the underlying stream fails while reading
 */
public final static boolean startsWithBCF2Magic(final InputStream stream) throws IOException {
    final byte[] magicBytes = new byte[BCF2Utils.MAGIC_HEADER_LINE.length];

    // InputStream.read(byte[]) is allowed to return fewer bytes than requested, so
    // keep reading until the buffer is full or the stream is exhausted
    int nRead = 0;
    while ( nRead < magicBytes.length ) {
        final int n = stream.read(magicBytes, nRead, magicBytes.length - nRead);
        if ( n < 0 )
            return false; // stream ended before a complete magic could be read
        nRead += n;
    }

    return Arrays.equals(magicBytes, BCF2Utils.MAGIC_HEADER_LINE);
}
/**
 * Reads exactly one byte from the stream.
 *
 * InputStream.read() signals end-of-stream with -1, which is indistinguishable from
 * the data byte 0xFF once narrowed to a byte, so end-of-stream is reported as an
 * error here rather than being returned as if it were data.
 *
 * @param stream the stream to read from
 * @return the byte that was read
 * @throws ReviewedStingException if the stream is exhausted or an I/O error occurs
 */
public final static byte readByte(final InputStream stream) {
    try {
        final int value = stream.read();
        if ( value == -1 )
            throw new ReviewedStingException("readByte failure: unexpected end of stream");
        return (byte)value;
    } catch ( IOException e ) {
        throw new ReviewedStingException("readByte failure", e);
    }
}
/**
 * Reads a signed integer of 1, 2 or 4 bytes from the stream.
 *
 * Bytes are combined most-significant first, and the result is sign-extended from
 * the requested width to a full int.
 *
 * @param bytesForEachInt the width of the encoded integer in bytes; must be 1, 2 or 4
 * @param stream the stream to read from
 * @return the decoded, sign-extended value
 * @throws ReviewedStingException if bytesForEachInt is not 1, 2 or 4 (nothing is
 *         read from the stream in that case)
 */
public final static int readInt(int bytesForEachInt, final InputStream stream) {
    final boolean supportedWidth = bytesForEachInt == 1 || bytesForEachInt == 2 || bytesForEachInt == 4;
    if ( ! supportedWidth )
        throw new ReviewedStingException("Unexpected size during decoding");

    // fold the bytes together, first byte read ends up most significant
    int accumulated = 0;
    for ( int i = 0; i < bytesForEachInt; i++ )
        accumulated = (accumulated << 8) | (readByte(stream) & 0xFF);

    // shift the value to the top of the int and back down arithmetically to
    // sign-extend it; for 4 bytes the shift distance is 0 and this is a no-op
    final int unusedBits = Integer.SIZE - Byte.SIZE * bytesForEachInt;
    return (accumulated << unusedBits) >> unusedBits;
}
}

View File

@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.*;
import java.util.*;
public class BCF2Writer extends IndexingVCFWriter {
@ -80,8 +77,21 @@ public class BCF2Writer extends IndexingVCFWriter {
stringDictionaryMap.put(dict.get(i), i);
}
// write out the header
StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Utils.VERSION_LINE, "BCF2 stream");
try {
// write out the header into a byte stream, get its length, and write everything to the file
final ByteArrayOutputStream capture = new ByteArrayOutputStream();
final OutputStreamWriter writer = new OutputStreamWriter(capture);
StandardVCFWriter.writeHeader(header, writer, doNotWriteGenotypes, StandardVCFWriter.getVersionLine(), "BCF2 stream");
writer.append('\0'); // the header is null terminated by a byte
writer.close();
final byte[] headerBytes = capture.toByteArray();
outputStream.write(BCF2Utils.MAGIC_HEADER_LINE);
BCF2Encoder.encodePrimitive(headerBytes.length, BCF2Type.INT32, outputStream);
outputStream.write(headerBytes);
} catch (IOException e) {
throw new UserException.CouldNotCreateOutputFile("BCF2 stream", "Got IOException while trying to write BCF2 header", e);
}
}
@Override

View File

@ -91,7 +91,7 @@ public class StandardVCFWriter extends IndexingVCFWriter {
@Override
public void writeHeader(VCFHeader header) {
mHeader = header;
writeHeader(mHeader, mWriter, doNotWriteGenotypes, VERSION_LINE, getStreamName());
writeHeader(mHeader, mWriter, doNotWriteGenotypes, getVersionLine(), getStreamName());
// determine if we use filters, so we should FORCE pass the records
// TODO -- this might not be necessary any longer as we have unfiltered, filtered, and PASS VCs
@ -101,6 +101,10 @@ public class StandardVCFWriter extends IndexingVCFWriter {
}
}
/**
 * Accessor for the VERSION_LINE constant of this writer, the same value that
 * writeHeader(VCFHeader) passes on as the version line when emitting a header.
 *
 * @return the value of VERSION_LINE
 */
public static final String getVersionLine() {
return VERSION_LINE;
}
public static void writeHeader(VCFHeader header,
final Writer writer,
final boolean doNotWriteGenotypes,