Move BCF2 from private utils to public codecs

This commit is contained in:
Mark DePristo 2012-05-12 12:13:33 -04:00
parent d13cda6b6f
commit 679ffdd333
9 changed files with 1996 additions and 0 deletions

View File

@ -0,0 +1,414 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class BCF2Codec implements FeatureCodec<VariantContext> {
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
private VCFHeader header = null;
private final ArrayList<String> contigNames = new ArrayList<String>();
private final ArrayList<String> dictionary = new ArrayList<String>();
private final BCF2Decoder decoder = new BCF2Decoder();
private boolean skipGenotypes = false;
// ----------------------------------------------------------------------
//
// Feature codec interface functions
//
// ----------------------------------------------------------------------
@Override
public Feature decodeLoc( final PositionalBufferedStream inputStream ) {
return decode(inputStream);
// TODO: a less expensive version of decodeLoc() that doesn't use VariantContext
// TODO: very easy -- just decodeSitesBlock, and then skip to end of end of sites block
// TODO: and then skip genotypes block
}
@Override
public VariantContext decode( final PositionalBufferedStream inputStream ) {
final VariantContextBuilder builder = new VariantContextBuilder();
final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream);
decoder.readNextBlock(sitesBlockSize, inputStream);
final SitesInfoForDecoding info = decodeSitesBlock(builder);
if ( isSkippingGenotypes() ) {
decoder.skipNextBlock(genotypeBlockSize, inputStream);
} else {
decoder.readNextBlock(genotypeBlockSize, inputStream);
decodeGenotypes(info, builder);
}
return builder.make();
}
@Override
public Class<VariantContext> getFeatureType() {
return VariantContext.class;
}
@Override
public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) {
AsciiLineReader headerReader = new AsciiLineReader(inputStream);
String headerLine;
List<String> headerLines = new ArrayList<String>();
boolean foundHeaderEnd = false;
try {
while ( ! foundHeaderEnd && (headerLine = headerReader.readLine()) != null) {
if ( headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) {
headerLines.add(headerLine);
}
else if ( headerLine.startsWith(VCFHeader.HEADER_INDICATOR) ) {
headerLines.add(headerLine);
foundHeaderEnd = true;
}
else {
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line");
}
}
}
catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 header");
}
if ( ! foundHeaderEnd ) {
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line");
}
// read the header
this.header = AbstractVCFCodec.parseHeader(headerLines, VCFHeaderVersion.VCF4_1);
// create the config offsets
for ( final VCFContigHeaderLine contig : header.getContigLines())
contigNames.add(contig.getID());
// create the string dictionary
parseDictionary(header);
// position right before next line (would be right before first real record byte at end of header)
return new FeatureCodecHeader(header, inputStream.getPosition());
}
@Override
public boolean canDecode( final String path ) {
try {
FileInputStream fis = new FileInputStream(path);
AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis));
String firstLine = reader.readLine();
if ( firstLine != null && firstLine.equals(BCF2Constants.VERSION_LINE) ) {
return true;
}
} catch ( FileNotFoundException e ) {
return false;
} catch ( IOException e ) {
return false;
}
return false;
}
private final void parseDictionary(final VCFHeader header) {
for ( final VCFHeaderLine line : header.getMetaData() ) {
if ( line.getKey().equals(BCF2Constants.DICTIONARY_LINE_TAG) ) {
for ( final String string : line.getValue().split(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR) )
dictionary.add(string);
break;
}
}
// if we got here we never found a dictionary, or there are no elements in the dictionary
if ( dictionary.size() == 0 )
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
}
public boolean isSkippingGenotypes() {
return skipGenotypes;
}
public void setSkipGenotypes(final boolean skipGenotypes) {
this.skipGenotypes = skipGenotypes;
}
// --------------------------------------------------------------------------------
//
// implicit block
//
// The first four records of BCF are inline untype encoded data of:
//
// 4 byte integer chrom offset
// 4 byte integer start
// 4 byte integer ref length
// 4 byte float qual
//
// --------------------------------------------------------------------------------
private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) {
final int contigOffset = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
final String contig = lookupContigName(contigOffset);
builder.chr(contig);
final int pos = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
final int refLength = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
builder.start((long)pos);
builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals
final Object qual = decoder.decodeSingleValue(BCFType.FLOAT);
if ( qual != null ) {
builder.log10PError(((Double)qual) / -10.0);
}
final int nAlleleInfo = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
final int nFormatSamples = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
final int nAlleles = nAlleleInfo >> 16;
final int nInfo = nAlleleInfo & 0x00FF;
final int nFormatFields = nFormatSamples >> 24;
final int nSamples = nFormatSamples & 0x0FFF;
decodeID(builder);
final ArrayList<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
decodeFilter(builder);
decodeInfo(builder, nInfo);
return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles);
}
private final static class SitesInfoForDecoding {
final int pos;
final int nFormatFields;
final int nSamples;
final ArrayList<Allele> alleles;
private SitesInfoForDecoding(final int pos, final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
this.pos = pos;
this.nFormatFields = nFormatFields;
this.nSamples = nSamples;
this.alleles = alleles;
}
}
private void decodeID( final VariantContextBuilder builder ) {
final String id = (String)decoder.decodeTypedValue();
if ( id == null ) {
builder.noID();
}
else {
builder.id(id);
}
}
public static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
if ( AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1);
return clippedAlleles;
} else
return unclippedAlleles;
}
private ArrayList<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) {
// TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
ArrayList<Allele> alleles = new ArrayList<Allele>(nAlleles);
String ref = null;
for ( int i = 0; i < nAlleles; i++ ) {
final String allele = (String)decoder.decodeTypedValue();
if ( i == 0 ) {
ref = allele;
alleles.add(Allele.create(allele, true));
} else {
alleles.add(Allele.create(allele, false));
}
}
alleles = clipAllelesIfNecessary(pos, ref, alleles);
builder.alleles(alleles);
builder.referenceBaseForIndel(ref.getBytes()[0]);
return alleles;
}
private void decodeFilter( final VariantContextBuilder builder ) {
final Object filters = decoder.decodeTypedValue();
if ( filters == null ) {
builder.unfiltered();
}
else {
builder.filters(new LinkedHashSet<String>(asStrings(filters)));
}
}
private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) {
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
for ( int i = 0; i < numInfoFields; i++ ) {
final String key = getDictionaryString();
Object value = decoder.decodeTypedValue();
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, key);
if ( metaData.getType() == VCFHeaderLineType.Flag ) value = true; // special case for flags
infoFieldEntries.put(key, value);
}
builder.attributes(infoFieldEntries);
}
private void decodeGenotypes( final SitesInfoForDecoding siteInfo, final VariantContextBuilder builder ) {
final List<String> samples = new ArrayList<String>(header.getGenotypeSamples());
final int nSamples = siteInfo.nSamples;
final int nFields = siteInfo.nFormatFields;
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(nFields, nSamples);
if ( samples.size() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + samples.size() +
" samples in header but have a record with " + nSamples + " samples");
final List<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
for ( int i = 0; i < nSamples; i++ ) {
final String sampleName = samples.get(i);
List<Allele> alleles = null;
boolean isPhased = false;
double log10PError = VariantContext.NO_LOG10_PERROR;
Set<String> filters = null;
Map<String, Object> attributes = null;
double[] log10Likelihoods = null;
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
final String field = entry.getKey();
final List<Object> values = entry.getValue();
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
alleles = decodeGenotypeAlleles(siteInfo.alleles, (List<Integer>)values.get(i));
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
final Integer value = (Integer)values.get(i);
if ( value != BCFType.INT8.getMissingJavaValue() )
log10PError = value / -10.0;
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
//filters = new HashSet<String>(values.get(i));
} else { // add to attributes
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
attributes.put(field, values.get(i));
}
}
if ( alleles == null ) throw new ReviewedStingException("BUG: no alleles found");
final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
genotypes.add(g);
}
builder.genotypes(genotypes);
}
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
for ( final Integer encode : encoded ) {
if ( encode == null ) // absent, as are all following by definition
return gt;
else {
final int offset = encode >> 1;
if ( offset == 0 )
gt.add(Allele.NO_CALL);
else
gt.add(siteAlleles.get(offset - 1));
}
}
return gt;
}
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) {
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
for ( int i = 0; i < nFields; i++ ) {
final String field = getDictionaryString();
final byte typeDescriptor = decoder.readTypeDescriptor();
final List<Object> values = new ArrayList<Object>(nSamples);
for ( int j = 0; j < nSamples; j++ )
values.add(decoder.decodeTypedValue(typeDescriptor));
map.put(field, values);
}
return map;
}
private final String getDictionaryString() {
final int offset = (Integer)decoder.decodeTypedValue();
final String field = dictionary.get(offset);
return field;
}
private final String lookupContigName( final int contigOffset ) {
if ( contigOffset < contigNames.size() ) {
return contigNames.get(contigOffset);
}
else {
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
}
}
// ----------------------------------------------------------------------
//
// Utility functions
//
// ----------------------------------------------------------------------
private final Collection<String> asStrings(final Object o) {
return asCollection(String.class, o);
}
private final <T> Collection<T> asCollection(final Class<T> c, final Object o) {
if ( o == null )
return Collections.emptyList();
else if ( o instanceof List ) {
return (List<T>)o;
} else {
return (Set<T>)Collections.singleton(o);
}
}
}

View File

@ -0,0 +1,44 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import java.nio.charset.Charset;
public class BCF2Constants {
public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d";
public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1);
public static final String DICTIONARY_LINE_TAG = "dictionary";
public static final String DICTIONARY_LINE_ENTRY_SEPARATOR = ",";
public static final Charset BCF2_TEXT_CHARSET = Charset.forName("US-ASCII"); // TODO: enforce this!
// Note that these values are prefixed by FFFFFF for convenience
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
public static final int INT32_MISSING_VALUE = 0x80000000;
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
}

View File

@ -0,0 +1,277 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
public class BCF2Decoder {
final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
byte[] recordBytes;
ByteArrayInputStream recordStream;
public BCF2Decoder() {
// nothing to do
}
/**
* Create a new decoder ready to read BCF2 data from the byte[] recordBytes, for testing purposes
*
* @param recordBytes
*/
protected BCF2Decoder(final byte[] recordBytes) {
setRecordBytes(recordBytes);
}
// ----------------------------------------------------------------------
//
// Routines to load, set, skip blocks of underlying data we are decoding
//
// ----------------------------------------------------------------------
/**
* Reads the next record from input stream and prepare this decoder to decode values from it
*
* @param stream
* @return
*/
public void readNextBlock(final int blockSizeInBytes, final InputStream stream) {
setRecordBytes(readRecordBytes(blockSizeInBytes, stream));
}
/**
* Skips the next record from input stream, invalidating current block data
*
* @param stream
* @return
*/
public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) {
try {
final int bytesRead = (int)stream.skip(blockSizeInBytes);
validateReadBytes(bytesRead, blockSizeInBytes);
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e);
}
this.recordBytes = null;
this.recordStream = null;
}
/**
* Returns the byte[] for the block of data we are currently decoding
* @return
*/
public byte[] getRecordBytes() {
return recordBytes;
}
/**
* The size of the current block in bytes
*
* @return
*/
public int getBlockSize() {
return recordBytes.length;
}
public boolean blockIsFullyDecoded() {
return recordStream.available() == 0;
}
/**
* Use the recordBytes[] to read BCF2 records from now on
*
* @param recordBytes
*/
public void setRecordBytes(final byte[] recordBytes) {
this.recordBytes = recordBytes;
this.recordStream = new ByteArrayInputStream(recordBytes);
}
// ----------------------------------------------------------------------
//
// High-level decoder
//
// ----------------------------------------------------------------------
public final Object decodeTypedValue() {
final byte typeDescriptor = readTypeDescriptor();
return decodeTypedValue(typeDescriptor);
}
public final Object decodeTypedValue(final byte typeDescriptor) {
final int size = TypeDescriptor.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : TypeDescriptor.decodeSize(typeDescriptor);
final BCFType type = TypeDescriptor.decodeType(typeDescriptor);
assert size >= 0;
if ( size == 0 ) {
return null;
} else if ( type == BCFType.CHAR ) { // special case string decoding for efficiency
return decodeLiteralString(size);
} else if ( size == 1 ) {
return decodeSingleValue(type);
} else {
final ArrayList<Object> ints = new ArrayList<Object>(size);
for ( int i = 0; i < size; i++ ) {
ints.add(decodeSingleValue(type));
}
return ints;
}
}
public final Object decodeSingleValue(final BCFType type) {
// TODO -- decodeTypedValue should integrate this routine
final int value = readInt(type.getSizeInBytes(), recordStream);
if ( value == type.getMissingBytes() )
return null;
else {
switch (type) {
case INT8:
case INT16:
case INT32: return value;
case FLOAT: return (double)rawFloatToFloat(value);
case CHAR: return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased
default: throw new ReviewedStingException("BCF2 codec doesn't know how to decode type " + type );
}
}
}
// ----------------------------------------------------------------------
//
// Decode raw primitive data types (ints, floats, and strings)
//
// ----------------------------------------------------------------------
private final String decodeLiteralString(final int size) {
// TODO -- assumes size > 0
final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array
try {
recordStream.read(bytes);
return new String(bytes);
} catch ( IOException e ) {
throw new ReviewedStingException("readByte failure", e);
}
}
private final int decodeVectorSize() {
final byte typeDescriptor = readTypeDescriptor();
final int size = TypeDescriptor.decodeSize(typeDescriptor);
final BCFType type = TypeDescriptor.decodeType(typeDescriptor);
assert size == 1;
assert type == BCFType.INT8 || type == BCFType.INT16 || type == BCFType.INT32;
return decodeInt(type.getSizeInBytes());
}
public final int decodeInt(int bytesForEachInt) {
return readInt(bytesForEachInt, recordStream);
}
public final float rawFloatToFloat(final int rawFloat) {
return Float.intBitsToFloat(rawFloat);
}
// ----------------------------------------------------------------------
//
// Utility functions
//
// ----------------------------------------------------------------------
/**
* Read the size of the next block from inputStream
*
* @param inputStream
* @return
*/
public final int readBlockSize(final InputStream inputStream) {
return readInt(4, inputStream);
}
/**
*
* @param inputStream
* @return
*/
private final static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) {
final byte[] record = new byte[blockSizeInBytes];
try {
final int bytesRead = inputStream.read(record);
validateReadBytes(bytesRead, blockSizeInBytes);
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e);
}
return record;
}
private final static void validateReadBytes(final int actuallyRead, final int expected) {
if ( actuallyRead < expected ) {
throw new UserException.MalformedBCF2(String.format("Failed to read next complete record: %s",
actuallyRead == -1 ?
"premature end of input stream" :
String.format("expected %d bytes but read only %d", expected, actuallyRead)));
}
}
public final byte readTypeDescriptor() {
return readByte(recordStream);
}
private final static byte readByte(final InputStream stream) {
try {
return (byte)(stream.read() & 0xFF);
} catch ( IOException e ) {
throw new ReviewedStingException("readByte failure", e);
}
}
private final static int readInt(int bytesForEachInt, final InputStream stream) {
switch ( bytesForEachInt ) {
case 1: {
return (byte)(readByte(stream));
} case 2: {
final int b1 = readByte(stream) & 0xFF;
final int b2 = readByte(stream) & 0xFF;
return (short)((b1 << 8) | b2);
} case 4: {
final int b1 = readByte(stream) & 0xFF;
final int b2 = readByte(stream) & 0xFF;
final int b3 = readByte(stream) & 0xFF;
final int b4 = readByte(stream) & 0xFF;
return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4);
} default: throw new ReviewedStingException("Unexpected size during decoding");
}
}
}

View File

@ -0,0 +1,234 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.*;
/**
* Simple BCF2 encoder
*
* @author depristo
* @since 5/12
*/
public class BCF2Encoder {
// TODO -- increase default size?
public static final int WRITE_BUFFER_INITIAL_SIZE = 16384;
private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE);
// --------------------------------------------------------------------------------
//
// Functions to return the data being encoded here
//
// --------------------------------------------------------------------------------
public int getRecordSizeInBytes() {
return encodeStream.size();
}
public byte[] getRecordBytes() {
byte[] bytes = encodeStream.toByteArray();
encodeStream.reset();
return bytes;
}
// --------------------------------------------------------------------------------
//
// Super-high level interface
//
// --------------------------------------------------------------------------------
/**
* Totally generic encoder that examines o, determines the best way to encode it, and encodes it
* @param o
* @return
*/
public final BCFType encode(final Object o) throws IOException {
if ( o == null ) throw new ReviewedStingException("Generic encode cannot deal with null values");
if ( o instanceof String ) {
return encodeString((String)o);
} else if ( o instanceof List ) {
final BCFType type = determinePrimitiveType(((List) o).get(0));
encodeTypedVector((List) o, type);
return type;
} else {
final BCFType type = determinePrimitiveType(o);
encodeTypedSingleton(o, type);
return type;
}
}
// --------------------------------------------------------------------------------
//
// Writing typed values (have type byte)
//
// --------------------------------------------------------------------------------
public final void encodeTypedMissing(final BCFType type) throws IOException {
encodeTypedVector(Collections.emptyList(), type);
}
// todo -- should be specialized for each object type for efficiency
public final void encodeTypedSingleton(final Object v, final BCFType type) throws IOException {
encodeTypedVector(Collections.singleton(v), type);
}
public final BCFType encodeString(final String v) throws IOException {
// TODO -- this needs to be optimized
final byte[] bytes = v.getBytes();
final List<Byte> l = new ArrayList<Byte>(bytes.length);
for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]);
encodeTypedVector(l, BCFType.CHAR);
return BCFType.CHAR;
}
public final <T extends Object> void encodeTypedVector(final Collection<T> v, final BCFType type) throws IOException {
encodeType(v.size(), type);
encodeRawValues(v, type);
}
public final BCFType encodeTypedIntOfBestSize(final int value) throws IOException {
final BCFType type = determineIntegerType(value);
encodeTypedSingleton(value, type);
return type;
}
// --------------------------------------------------------------------------------
//
// Writing raw values (don't have a type byte)
//
// --------------------------------------------------------------------------------
public final <T extends Object> void encodeRawValues(final Collection<T> v, final BCFType type) throws IOException {
for ( final T v1 : v ) {
encodeRawValue(v1, type);
}
}
public final <T extends Object> void encodeRawValue(final T value, final BCFType type) throws IOException {
if ( value == type.getMissingJavaValue() )
encodeRawMissingValue(type);
else {
switch (type) {
case INT8:
case INT16:
case INT32: encodePrimitive((Integer)value, type); break;
case FLOAT: encodeRawFloat((Float) value, type); break;
case CHAR: encodeRawChar((Byte) value); break;
default: throw new ReviewedStingException("Illegal type encountered " + type);
}
}
}
public final void encodeRawMissingValue(final BCFType type) throws IOException {
encodePrimitive(type.getMissingBytes(), type);
}
public final void encodeRawMissingValues(final int size, final BCFType type) throws IOException {
for ( int i = 0; i < size; i++ )
encodeRawMissingValue(type);
}
// --------------------------------------------------------------------------------
//
// low-level encoders
//
// --------------------------------------------------------------------------------
public final void encodeRawChar(final byte c) throws IOException {
encodeStream.write(c);
}
public final void encodeRawFloat(final float value, final BCFType type) throws IOException {
encodePrimitive(Float.floatToIntBits(value), type);
}
public final void encodeType(final int size, final BCFType type) throws IOException {
final byte typeByte = TypeDescriptor.encodeTypeDescriptor(size, type);
encodeStream.write(typeByte);
if ( TypeDescriptor.willOverflow(size) )
encodeTypedIntOfBestSize(size);
}
public final void encodeRawInt(final int value, final BCFType type) throws IOException {
encodePrimitive(value, type, encodeStream);
}
public final void encodePrimitive(final int value, final BCFType type) throws IOException {
encodePrimitive(value, type, encodeStream);
}
// --------------------------------------------------------------------------------
//
// utility functions
//
// --------------------------------------------------------------------------------
public final BCFType determineIntegerType(final List<Integer> values) {
BCFType maxType = BCFType.INT8;
for ( final int value : values ) {
final BCFType type1 = determineIntegerType(value);
switch ( type1 ) {
case INT8: break;
case INT16: maxType = BCFType.INT16; break;
case INT32: return BCFType.INT32; // fast path for largest possible value
default: throw new ReviewedStingException("Unexpected integer type " + type1 );
}
}
return maxType;
}
public final BCFType determineIntegerType(final int value) {
for ( final BCFType potentialType : TypeDescriptor.INTEGER_TYPES_BY_SIZE ) {
if ( potentialType.withinRange(value) )
return potentialType;
}
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
private final BCFType determinePrimitiveType(final Object v) {
if ( v instanceof Integer )
return determineIntegerType((Integer)v);
else if ( v instanceof Float )
return BCFType.FLOAT;
else
throw new ReviewedStingException("No native encoding for Object of type " + v.getClass().getSimpleName());
}
public final static void encodePrimitive(final int value, final BCFType type, final OutputStream encodeStream) throws IOException {
for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) {
final int shift = i * 8;
int mask = 0xFF << shift;
int byteValue = (mask & value) >> shift;
encodeStream.write(byteValue);
}
}
}

View File

@ -0,0 +1,136 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.*;
import java.util.*;
/**
* Testing BCF2
*
* @author Mark DePristo
* @since 2012
*/
public class BCF2TestWalker extends RodWalker<Integer, Integer> {
/**
* Variants from this VCF file are used by this tool as input.
* The file must at least contain the standard VCF header lines, but
* can be empty (i.e., no variants are contained in the file).
*/
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
public RodBinding<VariantContext> variants;
@Argument(doc="keep variants", required=false)
public boolean keepVariants = false;
@Argument(doc="quiet", required=false)
public boolean quiet = false;
@Argument(doc="dontIndexOnTheFly", required=false)
public boolean dontIndexOnTheFly = false;
@Output(doc="File to which results should be written",required=true)
protected File bcfFile;
private final List<VariantContext> vcs = new ArrayList<VariantContext>();
protected BCF2Writer writer;
@Override
public void initialize() {
final Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Collections.singletonList(variants));
final VCFHeader header = VCFUtils.withUpdatedContigs(vcfRods.values().iterator().next(), getToolkit());
try {
writer = new BCF2Writer("out", bcfFile, new FileOutputStream(bcfFile),
getToolkit().getMasterSequenceDictionary(), ! dontIndexOnTheFly );
writer.writeHeader(header);
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(bcfFile, e);
}
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null ) // RodWalkers can make funky map calls
return 0;
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
writer.add(vc);
if ( keepVariants ) vcs.add(vc);
}
return 1;
}
//
// default reduce -- doesn't do anything at all
//
public Integer reduceInit() { return 0; }
public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
public void onTraversalDone(Integer sum) {
try {
writer.close();
logger.info("Closed writer");
// read in the BCF records
BCF2Codec codec = new BCF2Codec();
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
FeatureCodecHeader header = codec.readHeader(pbs);
pbs.close();
pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
pbs.skip(header.getHeaderEnd());
Iterator<VariantContext> it = vcs.iterator();
while ( ! pbs.isDone() ) {
if ( keepVariants ) {
VariantContext expected = it.next();
if ( ! quiet )
System.out.printf("vcf = %s %d %s%n", expected.getChr(), expected.getStart(), expected);
}
VariantContext bcfRaw = codec.decode(pbs);
VariantContext bcf = new VariantContextBuilder(bcfRaw).source("variant").make();
if ( ! quiet ) {
System.out.printf("bcf = %s %d %s%n", bcf.getChr(), bcf.getStart(), bcf.toString());
System.out.printf("--------------------------------------------------%n");
}
}
} catch ( IOException e ) {
throw new UserException.CouldNotCreateOutputFile(bcfFile, "bad user!");
}
}
}

View File

@ -0,0 +1,387 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.*;
public class BCF2Writer extends IndexingVCFWriter {
final protected static Logger logger = Logger.getLogger(BCF2Writer.class);
private final static boolean doNotWriteGenotypes = false;
private OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support
private VCFHeader header;
private Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
private Map<String, Integer> stringDictionary = new LinkedHashMap<String, Integer>();
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
public BCF2Writer(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) {
super(name, location, output, refDict, enableOnTheFlyIndexing);
this.outputStream = getOutputStream();
}
// --------------------------------------------------------------------------------
//
// Interface functions
//
// --------------------------------------------------------------------------------
@Override
public void writeHeader(final VCFHeader header) {
this.header = header;
// create the config offsets map
for ( final VCFContigHeaderLine contig : header.getContigLines())
contigDictionary.put(contig.getID(), contig.getContigIndex());
// set up the strings dictionary
int offset = 0;
stringDictionary.put(VCFConstants.PASSES_FILTERS_v4, offset++); // special case the special PASS field
for ( VCFHeaderLine line : header.getMetaData() ) {
if ( line instanceof VCFIDHeaderLine ) {
VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
stringDictionary.put(idLine.getID(), offset++);
}
}
// add the dictionary ##dictionary=x,y,z line to the header
final String dictionaryLineValue = Utils.join(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR, stringDictionary.keySet());
header.addMetaDataLine(new VCFHeaderLine(BCF2Constants.DICTIONARY_LINE_TAG, dictionaryLineValue));
// write out the header
StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Constants.VERSION_LINE, "BCF2 stream");
}
@Override
public void add( final VariantContext initialVC ) {
final VariantContext vc = initialVC.fullyDecode(header);
super.add(vc); // allow on the fly indexing
try {
final byte[] infoBlock = buildSitesData(vc);
final byte[] genotypesBlock = buildSamplesData(vc);
// write the two blocks to disk
writeBlock(infoBlock, genotypesBlock);
}
catch ( IOException e ) {
throw new UserException("Error writing record to BCF2 file: " + vc.toString(), e);
}
}
@Override
public void close() {
try {
outputStream.flush();
outputStream.close();
}
catch ( IOException e ) {
throw new UserException("Failed to close BCF2 file");
}
super.close();
}
// --------------------------------------------------------------------------------
//
// implicit block
//
// The first four records of BCF are inline untype encoded data of:
//
// 4 byte integer chrom offset
// 4 byte integer start
// 4 byte integer ref length
// 4 byte float qual
//
// --------------------------------------------------------------------------------
private byte[] buildSitesData( VariantContext vc ) throws IOException {
final int contigIndex = contigDictionary.get(vc.getChr());
if ( contigIndex == -1 )
throw new UserException(String.format("Contig %s not found in sequence dictionary from reference", vc.getChr()));
// note use of encodeRawValue to not insert the typing byte
encoder.encodeRawValue(contigIndex, BCFType.INT32);
// pos
encoder.encodeRawValue(vc.getStart(), BCFType.INT32);
// ref length
encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCFType.INT32);
// qual
if ( vc.hasLog10PError() )
encoder.encodeRawFloat((float) vc.getPhredScaledQual(), BCFType.FLOAT);
else
encoder.encodeRawMissingValue(BCFType.FLOAT);
// info fields
final int nAlleles = vc.getNAlleles();
final int nInfo = vc.getAttributes().size();
final int nGenotypeFormatFields = StandardVCFWriter.calcVCFGenotypeKeys(vc).size();
final int nSamples = vc.getNSamples();
encoder.encodeRawInt((nAlleles << 16) | (nInfo & 0x00FF), BCFType.INT32);
encoder.encodeRawInt((nGenotypeFormatFields << 24) | (nSamples & 0x0FFF), BCFType.INT32);
buildID(vc);
buildAlleles(vc);
buildFilter(vc);
buildInfo(vc);
return encoder.getRecordBytes();
}
private void buildID( VariantContext vc ) throws IOException {
encoder.encodeString(vc.getID());
}
private void buildAlleles( VariantContext vc ) throws IOException {
for ( final Allele allele : vc.getAlleles() ) {
final String s = vc.getAlleleWithRefPadding(allele);
encoder.encodeString(s);
}
}
private void buildFilter( VariantContext vc ) throws IOException {
if ( vc.isFiltered() ) {
encodeStringsByRef(vc.getFilters());
} else {
encoder.encodeTypedMissing(BCFType.INT32);
}
}
private void buildInfo( VariantContext vc ) throws IOException {
for ( Map.Entry<String, Object> infoFieldEntry : vc.getAttributes().entrySet() ) {
final String key = infoFieldEntry.getKey();
Object value = infoFieldEntry.getValue();
final VCFToBCFType typeEquiv = getBCF2TypeFromHeader(key, value);
// handle the special FLAG case -- super annoying
if ( typeEquiv.vcfType == VCFHeaderLineType.Flag ) value = 1;
encodeStringByRef(key);
if ( value instanceof List ) // NOTE: ONLY WORKS WITH LISTS
encoder.encodeTypedVector((List) value, typeEquiv.bcfType);
else if ( value instanceof String )
encoder.encodeString((String)value);
else
encoder.encodeTypedSingleton(value, typeEquiv.bcfType);
}
}
private byte[] buildSamplesData(final VariantContext vc) throws IOException {
// write size
List<String> genotypeFields = StandardVCFWriter.calcVCFGenotypeKeys(vc);
for ( final String field : genotypeFields ) {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
addGenotypes(vc);
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
addGQ(vc);
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
addGenotypeFilters(vc);
} else {
addGenericGenotypeField(vc, field);
}
}
return encoder.getRecordBytes();
}
private final int getNGenotypeFieldValues(final String field, final VariantContext vc) {
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field);
int nFields = metaData.getCount(vc.getAlternateAlleles().size());
if ( nFields == -1 ) { // unbounded, need to look at values
return computeMaxSizeOfGenotypeFieldFromValues(field, vc);
} else {
return nFields;
}
}
private final int computeMaxSizeOfGenotypeFieldFromValues(final String field, final VariantContext vc) {
int size = 1;
final GenotypesContext gc = vc.getGenotypes();
for ( final Genotype g : gc ) {
final Object o = g.getAttribute(field);
if ( o == null ) continue;
if ( o instanceof List ) {
// only do compute if first value is of type list
final List values = (List)g.getAttribute(field);
if ( values != null )
size = Math.max(size, values.size());
} else {
return 1;
}
}
return size;
}
private final void addGenericGenotypeField(final VariantContext vc, final String field) throws IOException {
final int numInFormatField = getNGenotypeFieldValues(field, vc);
final VCFToBCFType type = getBCF2TypeFromHeader(field, null);
startGenotypeField(field, numInFormatField, type.bcfType);
for ( final Genotype g : vc.getGenotypes() ) {
if ( ! g.hasAttribute(field) ) {
encoder.encodeRawMissingValues(numInFormatField, type.bcfType);
} else {
final Object val = g.getAttribute(field);
final Collection<Object> vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val;
encoder.encodeRawValues(vals, type.bcfType);
}
}
}
private final class VCFToBCFType {
VCFHeaderLineType vcfType;
BCFType bcfType;
private VCFToBCFType(final VCFHeaderLineType vcfType, final BCFType bcfType) {
this.vcfType = vcfType;
this.bcfType = bcfType;
}
}
// TODO -- we really need explicit converters as first class objects
private final VCFToBCFType getBCF2TypeFromHeader(final String field, final Object maybeIntValue) {
// TODO -- need to generalize so we can enable vectors of compressed genotype ints
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field);
// TODO -- no sense in allocating these over and over
switch ( metaData.getType() ) {
case Character: return new VCFToBCFType(metaData.getType(), BCFType.CHAR);
case Flag: return new VCFToBCFType(metaData.getType(), BCFType.INT8);
case String: return new VCFToBCFType(metaData.getType(), BCFType.CHAR);
case Integer: return new VCFToBCFType(metaData.getType(), maybeIntValue != null ? encoder.determineIntegerType((Integer)maybeIntValue) : BCFType.INT32);
case Float: return new VCFToBCFType(metaData.getType(), BCFType.FLOAT);
default: throw new ReviewedStingException("Unexpected type for field " + field);
}
}
private final void addGenotypeFilters(final VariantContext vc) throws IOException {
logger.warn("Skipping genotype filter field");
// // TODO -- FIXME -- string is wrong here -- need to compute string size...
// startGenotypeField(VCFConstants.GENOTYPE_FILTER_KEY, 1, BCFType.CHAR);
// for ( final Genotype g : vc.getGenotypes() ) {
// if ( g.filtersWereApplied() && g.isFiltered() ) {
// encoder.encodeString(ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())));
// } else {
// encoder.encodeRawMissingValues(1, BCFType.CHAR); // todo fixme
// }
// }
}
private final void addGQ(final VariantContext vc) throws IOException {
startGenotypeField(VCFConstants.GENOTYPE_QUALITY_KEY, 1, BCFType.INT8);
for ( final Genotype g : vc.getGenotypes() ) {
if ( g.hasLog10PError() ) {
final int GQ = (int)Math.round(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL));
if ( GQ > VCFConstants.MAX_GENOTYPE_QUAL ) throw new ReviewedStingException("Unexpectedly large GQ " + GQ + " at " + vc);
encoder.encodeRawValue(GQ, BCFType.INT8);
} else {
encoder.encodeRawMissingValues(1, BCFType.INT8);
}
}
}
private final void addGenotypes(final VariantContext vc) throws IOException {
if ( vc.getNAlleles() > 127 )
throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
"with > 127 alleles, but you have " + vc.getNAlleles() + " at "
+ vc.getChr() + ":" + vc.getStart());
final Map<Allele, String> alleleMap = StandardVCFWriter.buildAlleleMap(vc);
final int requiredPloidy = 2; // TODO -- handle ploidy, will need padding / depadding
startGenotypeField(VCFConstants.GENOTYPE_KEY, requiredPloidy, BCFType.INT8);
for ( final Genotype g : vc.getGenotypes() ) {
if ( g.getPloidy() != requiredPloidy ) throw new ReviewedStingException("Cannot currently handle non-diploid calls!");
final List<Integer> encoding = new ArrayList<Integer>(requiredPloidy);
for ( final Allele a : g.getAlleles() ) {
final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a));
encoding.add(((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00));
}
encoder.encodeRawValues(encoding, BCFType.INT8);
}
}
/**
* Write the data in the encoder to the outputstream as a length encoded
* block of data. After this call the encoder stream will be ready to
* start a new data block
*
* @throws IOException
*/
private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException {
BCF2Encoder.encodePrimitive(infoBlock.length, BCFType.INT32, outputStream);
BCF2Encoder.encodePrimitive(genotypesBlock.length, BCFType.INT32, outputStream);
outputStream.write(infoBlock);
outputStream.write(genotypesBlock);
}
public final BCFType encodeStringByRef(final String string) throws IOException {
return encodeStringsByRef(Collections.singleton(string));
}
public final BCFType encodeStringsByRef(final Collection<String> strings) throws IOException {
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
BCFType maxType = BCFType.INT8; // start with the smallest size
// iterate over strings until we find one that needs 16 bits, and break
for ( final String string : strings ) {
final int offset = stringDictionary.get(string);
offsets.add(offset);
final BCFType type1 = encoder.determineIntegerType(offset);
switch ( type1 ) {
case INT8: break;
case INT16: if ( maxType == BCFType.INT8 ) maxType = BCFType.INT16; break;
case INT32: maxType = BCFType.INT32; break;
default: throw new ReviewedStingException("Unexpected type " + type1);
}
}
// we've checked the types for all strings, so write them out
encoder.encodeTypedVector(offsets, maxType);
return maxType;
}
public final void startGenotypeField(final String key, final int size, final BCFType valueType) throws IOException {
encodeStringByRef(key);
encoder.encodeType(size, valueType);
}
}

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
/**
* BCF2 types and information
*
* @author depristo
* @since 05/12
*/
public enum BCFType {
RESERVED_0,
INT8(1, BCF2Constants.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
INT16(2, BCF2Constants.INT16_MISSING_VALUE, -32767, 32767),
INT32(4, BCF2Constants.INT32_MISSING_VALUE, -2147483647, 2147483647),
RESERVED_4,
FLOAT(4, BCF2Constants.FLOAT_MISSING_VALUE),
RESERVED_6,
CHAR;
private final Object missingJavaValue;
private final int missingBytes;
private final int sizeInBytes;
private final long minValue, maxValue;
BCFType() {
this(-1, 0, 0, 0);
}
BCFType(final int sizeInBytes, final int missingBytes) {
this(sizeInBytes, missingBytes, 0, 0);
}
BCFType(final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) {
this.sizeInBytes = sizeInBytes;
this.missingJavaValue = null;
this.missingBytes = missingBytes;
this.minValue = minValue;
this.maxValue = maxValue;
}
public int getSizeInBytes() {
return sizeInBytes;
}
public int getID() { return ordinal(); }
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
public Object getMissingJavaValue() { return missingJavaValue; }
public int getMissingBytes() { return missingBytes; }
}

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
/**
* Simple BCF decoder
* @author Mark DePristo
* @since 5/3/12
*/
public class TypeDescriptor {
public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14;
public final static BCFType[] INTEGER_TYPES_BY_SIZE = new BCFType[3];
public final static BCFType[] DICTIONARY_TYPES_BY_SIZE = INTEGER_TYPES_BY_SIZE;
private final static BCFType[] lookup = BCFType.values();
static {
INTEGER_TYPES_BY_SIZE[0] = BCFType.INT8;
INTEGER_TYPES_BY_SIZE[1] = BCFType.INT16;
INTEGER_TYPES_BY_SIZE[2] = BCFType.INT32;
}
public final static byte encodeTypeDescriptor(final int nElements, final BCFType type ) {
int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER);
byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F));
return typeByte;
}
public final static int decodeSize(final byte typeDescriptor) {
return (0xF0 & typeDescriptor) >> 4;
}
public final static int decodeTypeID(final byte typeDescriptor) {
return typeDescriptor & 0x0F;
}
public final static BCFType decodeType(final byte typeDescriptor) {
return lookup[decodeTypeID(typeDescriptor)];
}
public final static boolean sizeIsOverflow(final byte typeDescriptor) {
return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
}
public final static boolean willOverflow(final long nElements) {
return nElements > MAX_INLINE_ELEMENTS;
}
}

View File

@ -0,0 +1,362 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
// our package
package org.broadinstitute.sting.utils.codecs.bcf2;
// the imports for unit testing.
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
public class EncoderDecoderUnitTest extends BaseTest {
private final float FLOAT_TOLERANCE = (float)1e-8;
final List<BCF2TypedValue> primitives = new ArrayList<BCF2TypedValue>();
final List<BCF2TypedValue> basicTypes = new ArrayList<BCF2TypedValue>();
final List<BCF2TypedValue> forCombinations = new ArrayList<BCF2TypedValue>();
@BeforeSuite
public void before() {
basicTypes.add(new BCF2TypedValue(1, BCFType.INT8));
basicTypes.add(new BCF2TypedValue(1000, BCFType.INT16));
basicTypes.add(new BCF2TypedValue(1000000, BCFType.INT32));
basicTypes.add(new BCF2TypedValue(1.2345e6, BCFType.FLOAT));
basicTypes.add(new BCF2TypedValue(new Byte((byte)'A'), BCFType.CHAR));
// small ints
primitives.add(new BCF2TypedValue(0, BCFType.INT8));
primitives.add(new BCF2TypedValue(10, BCFType.INT8));
primitives.add(new BCF2TypedValue(-1, BCFType.INT8));
primitives.add(new BCF2TypedValue(100, BCFType.INT8));
primitives.add(new BCF2TypedValue(-100, BCFType.INT8));
primitives.add(new BCF2TypedValue(-127, BCFType.INT8)); // last value in range
primitives.add(new BCF2TypedValue( 127, BCFType.INT8)); // last value in range
// medium ints
primitives.add(new BCF2TypedValue(-1000, BCFType.INT16));
primitives.add(new BCF2TypedValue(1000, BCFType.INT16));
primitives.add(new BCF2TypedValue(-128, BCFType.INT16)); // first value in range
primitives.add(new BCF2TypedValue( 128, BCFType.INT16)); // first value in range
primitives.add(new BCF2TypedValue(-32767, BCFType.INT16)); // last value in range
primitives.add(new BCF2TypedValue( 32767, BCFType.INT16)); // last value in range
// larger ints
primitives.add(new BCF2TypedValue(-32768, BCFType.INT32)); // first value in range
primitives.add(new BCF2TypedValue( 32768, BCFType.INT32)); // first value in range
primitives.add(new BCF2TypedValue(-100000, BCFType.INT32));
primitives.add(new BCF2TypedValue(100000, BCFType.INT32));
primitives.add(new BCF2TypedValue(-2147483647, BCFType.INT32));
primitives.add(new BCF2TypedValue(2147483647, BCFType.INT32));
// floats
primitives.add(new BCF2TypedValue(0.0, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(-0.0, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(1.0, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(-1.0, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(1.1, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(-1.1, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(5.0 / 3.0, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(-5.0 / 3.0, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(1.23e3, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(1.23e6, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(1.23e9, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(1.23e12, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(1.23e15, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(-1.23e3, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(-1.23e6, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(-1.23e9, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(-1.23e12, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(-1.23e15, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(Float.MIN_VALUE, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(Float.MAX_VALUE, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(Float.NEGATIVE_INFINITY, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(Float.POSITIVE_INFINITY, BCFType.FLOAT));
primitives.add(new BCF2TypedValue(Float.NaN, BCFType.FLOAT));
// strings
//primitives.add(new BCF2TypedValue("", BCFType.CHAR)); <- will be null (which is right)
primitives.add(new BCF2TypedValue("S", BCFType.CHAR));
primitives.add(new BCF2TypedValue("S2", BCFType.CHAR));
primitives.add(new BCF2TypedValue("12345678910", BCFType.CHAR));
primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR));
primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR));
// missing values
for ( BCFType type : BCFType.values() ) {
primitives.add(new BCF2TypedValue(null, type));
}
forCombinations.add(new BCF2TypedValue(10, BCFType.INT8));
forCombinations.add(new BCF2TypedValue(100, BCFType.INT8));
forCombinations.add(new BCF2TypedValue(-100, BCFType.INT8));
forCombinations.add(new BCF2TypedValue(-128, BCFType.INT16)); // first value in range
forCombinations.add(new BCF2TypedValue( 128, BCFType.INT16)); // first value in range
forCombinations.add(new BCF2TypedValue(-100000, BCFType.INT32));
forCombinations.add(new BCF2TypedValue(100000, BCFType.INT32));
forCombinations.add(new BCF2TypedValue(0.0, BCFType.FLOAT));
forCombinations.add(new BCF2TypedValue(1.23e6, BCFType.FLOAT));
forCombinations.add(new BCF2TypedValue(-1.23e6, BCFType.FLOAT));
forCombinations.add(new BCF2TypedValue("S", BCFType.CHAR));
forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR));
forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR));
// missing values
for ( BCFType type : BCFType.values() ) {
forCombinations.add(new BCF2TypedValue(null, type));
}
}
// --------------------------------------------------------------------------------
//
// merge case Provider
//
// --------------------------------------------------------------------------------
private class BCF2TypedValue {
final BCFType type;
final Object value;
private BCF2TypedValue(final int value, final BCFType type) {
this(new Integer(value), type);
}
private BCF2TypedValue(final double value, final BCFType type) {
this(new Float(value), type);
}
private BCF2TypedValue(final Object value, final BCFType type) {
this.type = type;
this.value = value;
}
public boolean isMissing() { return value == null; }
@Override
public String toString() {
return String.format("%s of %s", value, type);
}
}
@DataProvider(name = "BCF2EncodingTestProviderSingletons")
public Object[][] BCF2EncodingTestProviderSingletons() {
List<Object[]> tests = new ArrayList<Object[]>();
for ( BCF2TypedValue tv : primitives )
tests.add(new Object[]{Arrays.asList(tv)});
return tests.toArray(new Object[][]{});
}
@DataProvider(name = "BCF2EncodingTestProviderBasicTypes")
public Object[][] BCF2EncodingTestProviderBasicTypes() {
List<Object[]> tests = new ArrayList<Object[]>();
for ( BCF2TypedValue tv : basicTypes )
tests.add(new Object[]{Arrays.asList(tv)});
return tests.toArray(new Object[][]{});
}
@DataProvider(name = "BCF2EncodingTestProviderSequences")
public Object[][] BCF2EncodingTestProviderSequences() {
List<Object[]> tests = new ArrayList<Object[]>();
for ( BCF2TypedValue tv1 : forCombinations )
for ( BCF2TypedValue tv2 : forCombinations )
for ( BCF2TypedValue tv3 : forCombinations )
tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3)});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "BCF2EncodingTestProviderSingletons")
public void testBCF2EncodingSingletons(final List<BCF2TypedValue> toEncode) throws IOException {
final byte[] record = encodeRecord(toEncode);
decodeRecord(toEncode, record);
}
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
public void testBCF2EncodingVectors(final List<BCF2TypedValue> toEncode) throws IOException {
for ( final BCF2TypedValue tv : toEncode ) {
for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) {
BCF2Encoder encoder = new BCF2Encoder();
List<Object> expected = Collections.nCopies(length, tv.value);
encoder.encodeTypedVector(expected, tv.type);
BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
final Object decoded = decoder.decodeTypedValue();
if ( tv.type == BCFType.CHAR ) {
Assert.assertTrue(decoded instanceof String);
final String decodedString = (String)decoded;
Assert.assertTrue(decodedString.length() == length);
} else {
Assert.assertTrue(decoded instanceof List);
final List<Object> decodedList = (List<Object>)decoded;
Assert.assertEquals(decodedList.size(), expected.size());
for ( Object decodedValue : decodedList )
myAssertEquals(tv, decodedValue);
}
}
}
}
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
public void testBCF2EncodingVectorsWithMissing(final List<BCF2TypedValue> toEncode) throws IOException {
for ( final BCF2TypedValue tv : toEncode ) {
if ( tv.type != BCFType.CHAR ) {
for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) {
final byte td = TypeDescriptor.encodeTypeDescriptor(1, tv.type);
final BCF2Encoder encoder = new BCF2Encoder();
for ( int i = 0; i < length; i++ ) {
encoder.encodeRawValue(i % 2 == 0 ? null : tv.value, tv.type);
}
final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
for ( int i = 0; i < length; i++ ) {
final Object decoded = decoder.decodeTypedValue(td);
myAssertEquals(i % 2 == 0 ? new BCF2TypedValue(null, tv.type) : tv, decoded);
}
}
}
}
}
@Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingSingletons")
public void testBCF2EncodingTestProviderSequences(final List<BCF2TypedValue> toEncode) throws IOException {
final byte[] record = encodeRecord(toEncode);
decodeRecord(toEncode, record);
}
@Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingTestProviderSequences")
public void testReadAndSkipWithMultipleBlocks(final List<BCF2TypedValue> block) throws IOException {
testReadAndSkipWithMultipleBlocks(block, forCombinations);
testReadAndSkipWithMultipleBlocks(forCombinations, block);
}
public void testReadAndSkipWithMultipleBlocks(final List<BCF2TypedValue> block1, final List<BCF2TypedValue> block2) throws IOException {
final byte[] record1 = encodeRecord(block1);
final byte[] record2 = encodeRecord(block2);
// each record is individually good
decodeRecord(block1, record1);
decodeRecord(block2, record2);
BCF2Decoder decoder = new BCF2Decoder();
// test setting
decoder.setRecordBytes(record1);
decodeRecord(block1, decoder);
decoder.setRecordBytes(record2);
decodeRecord(block2, decoder);
// test combining the streams
final byte[] combined = combineRecords(record1, record2);
final List<BCF2TypedValue> combinedObjects = new ArrayList<BCF2TypedValue>(block1);
combinedObjects.addAll(block2);
// the combined bytes is the same as the combined objects
InputStream stream = new ByteArrayInputStream(combined);
decoder.readNextBlock(record1.length, stream);
decodeRecord(block1, decoder);
decoder.readNextBlock(record2.length, stream);
decodeRecord(block2, decoder);
// skipping the first block allows us to read the second block directly
stream = new ByteArrayInputStream(combined);
decoder.skipNextBlock(record1.length, stream);
decoder.readNextBlock(record2.length, stream);
decodeRecord(block2, decoder);
}
private final byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
baos.write(record1);
baos.write(record2);
return baos.toByteArray();
}
private final byte[] encodeRecord(final List<BCF2TypedValue> toEncode) throws IOException {
BCF2Encoder encoder = new BCF2Encoder();
for ( final BCF2TypedValue tv : toEncode ) {
if ( tv.isMissing() )
encoder.encodeTypedMissing(tv.type);
else {
final BCFType encodedType = encoder.encode(tv.value);
if ( tv.type != null ) // only if we have an expectation
Assert.assertEquals(encodedType, tv.type);
}
}
// check output
final byte[] record = encoder.getRecordBytes();
Assert.assertNotNull(record);
Assert.assertTrue(record.length > 0);
return record;
}
private final void decodeRecord(final List<BCF2TypedValue> toEncode, final byte[] record) {
decodeRecord(toEncode, new BCF2Decoder(record));
}
private final void decodeRecord(final List<BCF2TypedValue> toEncode, final BCF2Decoder decoder) {
for ( final BCF2TypedValue tv : toEncode ) {
Assert.assertFalse(decoder.blockIsFullyDecoded());
final Object decoded = decoder.decodeTypedValue();
myAssertEquals(tv, decoded);
}
Assert.assertTrue(decoder.blockIsFullyDecoded());
}
private final void myAssertEquals(final BCF2TypedValue tv, final Object decoded) {
if ( tv.value == null ) { // special needs for instanceof double
Assert.assertEquals(decoded, tv.value);
} else if ( tv.type == BCFType.FLOAT ) { // need tolerance for floats, and they aren't null
Assert.assertTrue(decoded instanceof Double);
final float valueFloat = (float)(Float)tv.value;
final float decodedFloat = (float)(double)(Double)decoded;
if ( Float.isNaN(valueFloat) ) // NaN == NaN => false unfortunately
Assert.assertTrue(Float.isNaN(decodedFloat));
else {
Assert.assertEquals(decodedFloat, valueFloat, FLOAT_TOLERANCE);
}
} else
Assert.assertEquals(decoded, tv.value);
}
}