BCF2 header encoding decoding at final spec
This commit is contained in:
parent
ce9e9eebb1
commit
93cef82637
|
|
@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
|
@ -50,6 +51,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
private ArrayList<String> dictionary;
|
||||
private final BCF2Decoder decoder = new BCF2Decoder();
|
||||
private boolean skipGenotypes = false;
|
||||
private final static int MAX_HEADER_SIZE = 0x08000000;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -91,36 +93,29 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
|
||||
@Override
|
||||
public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) {
|
||||
AsciiLineReader headerReader = new AsciiLineReader(inputStream);
|
||||
String headerLine;
|
||||
List<String> headerLines = new ArrayList<String>();
|
||||
boolean foundHeaderEnd = false;
|
||||
|
||||
try {
|
||||
while ( ! foundHeaderEnd && (headerLine = headerReader.readLine()) != null) {
|
||||
if ( headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) {
|
||||
headerLines.add(headerLine);
|
||||
}
|
||||
else if ( headerLine.startsWith(VCFHeader.HEADER_INDICATOR) ) {
|
||||
headerLines.add(headerLine);
|
||||
foundHeaderEnd = true;
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch ( IOException e ) {
|
||||
// note that this reads the magic as well, and so does double duty
|
||||
if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
|
||||
throw new IllegalArgumentException("Input stream does not begin with BCF2 magic");
|
||||
|
||||
final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);
|
||||
|
||||
if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than MAX_HEADER_SIZE (0x08000000 bytes = 128 MB)
|
||||
throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
|
||||
|
||||
final byte[] headerBytes = new byte[headerSizeInBytes];
|
||||
if ( inputStream.read(headerBytes) != headerSizeInBytes )
|
||||
throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
|
||||
|
||||
final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes));
|
||||
final AsciiLineReader headerReader = new AsciiLineReader(bps);
|
||||
final VCFCodec headerParser = new VCFCodec();
|
||||
this.header = (VCFHeader)headerParser.readHeader(headerReader);
|
||||
bps.close();
|
||||
} catch ( IOException e ) {
|
||||
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 header");
|
||||
}
|
||||
|
||||
if ( ! foundHeaderEnd ) {
|
||||
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line");
|
||||
}
|
||||
|
||||
// read the header
|
||||
this.header = AbstractVCFCodec.parseHeader(headerLines, VCFHeaderVersion.VCF4_1);
|
||||
|
||||
// create the contig offsets
|
||||
for ( final VCFContigHeaderLine contig : header.getContigLines())
|
||||
contigNames.add(contig.getID());
|
||||
|
|
@ -136,18 +131,12 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
public boolean canDecode( final String path ) {
|
||||
try {
|
||||
FileInputStream fis = new FileInputStream(path);
|
||||
AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis));
|
||||
String firstLine = reader.readLine();
|
||||
if ( firstLine != null && firstLine.equals(BCF2Utils.VERSION_LINE) ) {
|
||||
return true;
|
||||
}
|
||||
return BCF2Utils.startsWithBCF2Magic(fis);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
return false;
|
||||
} catch ( IOException e ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private final ArrayList<String> parseDictionary(final VCFHeader header) {
|
||||
|
|
|
|||
|
|
@ -151,7 +151,7 @@ public class BCF2Decoder {
|
|||
|
||||
public final Object decodeSingleValue(final BCF2Type type) {
|
||||
// TODO -- decodeTypedValue should integrate this routine
|
||||
final int value = readInt(type.getSizeInBytes(), recordStream);
|
||||
final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
|
||||
|
||||
if ( value == type.getMissingBytes() )
|
||||
return null;
|
||||
|
|
@ -196,7 +196,7 @@ public class BCF2Decoder {
|
|||
}
|
||||
|
||||
public final int decodeInt(int bytesForEachInt) {
|
||||
return readInt(bytesForEachInt, recordStream);
|
||||
return BCF2Utils.readInt(bytesForEachInt, recordStream);
|
||||
}
|
||||
|
||||
public final float rawFloatToFloat(final int rawFloat) {
|
||||
|
|
@ -216,7 +216,7 @@ public class BCF2Decoder {
|
|||
* @return
|
||||
*/
|
||||
public final int readBlockSize(final InputStream inputStream) {
|
||||
return readInt(4, inputStream);
|
||||
return BCF2Utils.readInt(4, inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -246,32 +246,6 @@ public class BCF2Decoder {
|
|||
}
|
||||
|
||||
public final byte readTypeDescriptor() {
|
||||
return readByte(recordStream);
|
||||
}
|
||||
|
||||
private final static byte readByte(final InputStream stream) {
|
||||
try {
|
||||
return (byte)(stream.read() & 0xFF);
|
||||
} catch ( IOException e ) {
|
||||
throw new ReviewedStingException("readByte failure", e);
|
||||
}
|
||||
}
|
||||
|
||||
private final static int readInt(int bytesForEachInt, final InputStream stream) {
|
||||
switch ( bytesForEachInt ) {
|
||||
case 1: {
|
||||
return (byte)(readByte(stream));
|
||||
} case 2: {
|
||||
final int b1 = readByte(stream) & 0xFF;
|
||||
final int b2 = readByte(stream) & 0xFF;
|
||||
return (short)((b1 << 8) | b2);
|
||||
} case 4: {
|
||||
final int b1 = readByte(stream) & 0xFF;
|
||||
final int b2 = readByte(stream) & 0xFF;
|
||||
final int b3 = readByte(stream) & 0xFF;
|
||||
final int b4 = readByte(stream) & 0xFF;
|
||||
return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4);
|
||||
} default: throw new ReviewedStingException("Unexpected size during decoding");
|
||||
}
|
||||
return BCF2Utils.readByte(recordStream);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,8 +28,12 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Common utilities for working with BCF2 files
|
||||
|
|
@ -40,12 +44,10 @@ import java.util.ArrayList;
|
|||
* @since 5/12
|
||||
*/
|
||||
public class BCF2Utils {
|
||||
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
|
||||
|
||||
public static final int OVERFLOW_ELEMENT_MARKER = 15;
|
||||
public static final int MAX_INLINE_ELEMENTS = 14;
|
||||
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[3];
|
||||
private final static BCF2Type[] LOOKUP = BCF2Type.values();
|
||||
public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d";
|
||||
public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1);
|
||||
|
||||
// Note that these values are prefixed by FFFFFF for convenience
|
||||
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
|
||||
|
|
@ -53,11 +55,9 @@ public class BCF2Utils {
|
|||
public static final int INT32_MISSING_VALUE = 0x80000000;
|
||||
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
|
||||
|
||||
static {
|
||||
BCF2Utils.INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8;
|
||||
BCF2Utils.INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16;
|
||||
BCF2Utils.INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32;
|
||||
}
|
||||
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
|
||||
|
||||
private BCF2Utils() {}
|
||||
|
||||
/**
|
||||
* Create a strings dictionary from the VCF header
|
||||
|
|
@ -98,7 +98,7 @@ public class BCF2Utils {
|
|||
}
|
||||
|
||||
public final static BCF2Type decodeType(final byte typeDescriptor) {
|
||||
return LOOKUP[decodeTypeID(typeDescriptor)];
|
||||
return BCF2Type.values()[decodeTypeID(typeDescriptor)];
|
||||
}
|
||||
|
||||
public final static boolean sizeIsOverflow(final byte typeDescriptor) {
|
||||
|
|
@ -108,4 +108,36 @@ public class BCF2Utils {
|
|||
public final static boolean willOverflow(final long nElements) {
|
||||
return nElements > MAX_INLINE_ELEMENTS;
|
||||
}
|
||||
|
||||
public final static boolean startsWithBCF2Magic(final InputStream stream) throws IOException {
|
||||
final byte[] magicBytes = new byte[BCF2Utils.MAGIC_HEADER_LINE.length];
|
||||
stream.read(magicBytes);
|
||||
return Arrays.equals(magicBytes, BCF2Utils.MAGIC_HEADER_LINE);
|
||||
}
|
||||
|
||||
public final static byte readByte(final InputStream stream) {
|
||||
try {
|
||||
return (byte)(stream.read() & 0xFF);
|
||||
} catch ( IOException e ) {
|
||||
throw new ReviewedStingException("readByte failure", e);
|
||||
}
|
||||
}
|
||||
|
||||
public final static int readInt(int bytesForEachInt, final InputStream stream) {
|
||||
switch ( bytesForEachInt ) {
|
||||
case 1: {
|
||||
return (byte)(readByte(stream));
|
||||
} case 2: {
|
||||
final int b1 = readByte(stream) & 0xFF;
|
||||
final int b2 = readByte(stream) & 0xFF;
|
||||
return (short)((b1 << 8) | b2);
|
||||
} case 4: {
|
||||
final int b1 = readByte(stream) & 0xFF;
|
||||
final int b2 = readByte(stream) & 0xFF;
|
||||
final int b3 = readByte(stream) & 0xFF;
|
||||
final int b4 = readByte(stream) & 0xFF;
|
||||
return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4);
|
||||
} default: throw new ReviewedStingException("Unexpected size during decoding");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
public class BCF2Writer extends IndexingVCFWriter {
|
||||
|
|
@ -80,8 +77,21 @@ public class BCF2Writer extends IndexingVCFWriter {
|
|||
stringDictionaryMap.put(dict.get(i), i);
|
||||
}
|
||||
|
||||
// write out the header
|
||||
StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Utils.VERSION_LINE, "BCF2 stream");
|
||||
try {
|
||||
// write out the header into a byte stream, get its length, and write everything to the file
|
||||
final ByteArrayOutputStream capture = new ByteArrayOutputStream();
|
||||
final OutputStreamWriter writer = new OutputStreamWriter(capture);
|
||||
StandardVCFWriter.writeHeader(header, writer, doNotWriteGenotypes, StandardVCFWriter.getVersionLine(), "BCF2 stream");
|
||||
writer.append('\0'); // the header is null terminated by a byte
|
||||
writer.close();
|
||||
|
||||
final byte[] headerBytes = capture.toByteArray();
|
||||
outputStream.write(BCF2Utils.MAGIC_HEADER_LINE);
|
||||
BCF2Encoder.encodePrimitive(headerBytes.length, BCF2Type.INT32, outputStream);
|
||||
outputStream.write(headerBytes);
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile("BCF2 stream", "Got IOException while trying to write BCF2 header", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ public class StandardVCFWriter extends IndexingVCFWriter {
|
|||
@Override
|
||||
public void writeHeader(VCFHeader header) {
|
||||
mHeader = header;
|
||||
writeHeader(mHeader, mWriter, doNotWriteGenotypes, VERSION_LINE, getStreamName());
|
||||
writeHeader(mHeader, mWriter, doNotWriteGenotypes, getVersionLine(), getStreamName());
|
||||
|
||||
// determine if we use filters, so we should FORCE pass the records
|
||||
// TODO -- this might not be necessary any longer as we have unfiltered, filtered, and PASS VCs
|
||||
|
|
@ -101,6 +101,10 @@ public class StandardVCFWriter extends IndexingVCFWriter {
|
|||
}
|
||||
}
|
||||
|
||||
public static final String getVersionLine() {
|
||||
return VERSION_LINE;
|
||||
}
|
||||
|
||||
public static void writeHeader(VCFHeader header,
|
||||
final Writer writer,
|
||||
final boolean doNotWriteGenotypes,
|
||||
|
|
|
|||
Loading…
Reference in New Issue