diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java new file mode 100644 index 000000000..8d7f6f91c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.FeatureCodecHeader; +import org.broad.tribble.readers.AsciiLineReader; +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; + +public class BCF2Codec implements FeatureCodec { + final protected static Logger logger = Logger.getLogger(BCF2Codec.class); + private VCFHeader header = null; + private final ArrayList contigNames = new ArrayList(); + private final ArrayList dictionary = new ArrayList(); + private final BCF2Decoder decoder = new BCF2Decoder(); + private boolean skipGenotypes = false; + + // ---------------------------------------------------------------------- + // + // Feature codec interface functions + // + // ---------------------------------------------------------------------- + + @Override + public Feature decodeLoc( final PositionalBufferedStream inputStream ) { + return decode(inputStream); + // TODO: a less expensive version of decodeLoc() that doesn't use VariantContext + // TODO: very easy -- just decodeSitesBlock, and then skip to end of end of sites block + // TODO: and then skip genotypes block + } + + @Override + public VariantContext decode( final PositionalBufferedStream inputStream ) { + final VariantContextBuilder builder = new VariantContextBuilder(); + + final int sitesBlockSize = decoder.readBlockSize(inputStream); + final int genotypeBlockSize = decoder.readBlockSize(inputStream); + decoder.readNextBlock(sitesBlockSize, inputStream); + final SitesInfoForDecoding info = decodeSitesBlock(builder); + + if ( isSkippingGenotypes() ) { + decoder.skipNextBlock(genotypeBlockSize, inputStream); + } else { + decoder.readNextBlock(genotypeBlockSize, inputStream); + decodeGenotypes(info, builder); + } + + return builder.make(); + } + + @Override + public Class getFeatureType() { + return VariantContext.class; + } + + @Override + public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) { + AsciiLineReader headerReader = new AsciiLineReader(inputStream); + String headerLine; + List headerLines = new ArrayList(); + boolean foundHeaderEnd = false; + + try { + while ( ! foundHeaderEnd && (headerLine = headerReader.readLine()) != null) { + if ( headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) { + headerLines.add(headerLine); + } + else if ( headerLine.startsWith(VCFHeader.HEADER_INDICATOR) ) { + headerLines.add(headerLine); + foundHeaderEnd = true; + } + else { + throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line"); + } + } + } + catch ( IOException e ) { + throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 header"); + } + + if ( ! foundHeaderEnd ) { + throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line"); + } + + // read the header + this.header = AbstractVCFCodec.parseHeader(headerLines, VCFHeaderVersion.VCF4_1); + + // create the config offsets + for ( final VCFContigHeaderLine contig : header.getContigLines()) + contigNames.add(contig.getID()); + + // create the string dictionary + parseDictionary(header); + + // position right before next line (would be right before first real record byte at end of header) + return new FeatureCodecHeader(header, inputStream.getPosition()); + } + + @Override + public boolean canDecode( final String path ) { + try { + FileInputStream fis = new FileInputStream(path); + AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis)); + String firstLine = reader.readLine(); + if ( firstLine != null && firstLine.equals(BCF2Constants.VERSION_LINE) ) { + return true; + } + } catch ( FileNotFoundException e ) { + return false; + } catch ( IOException e ) { + return false; + } + + return false; + } + + private final void parseDictionary(final VCFHeader header) { + for ( final VCFHeaderLine line : header.getMetaData() ) { + if ( line.getKey().equals(BCF2Constants.DICTIONARY_LINE_TAG) ) { + for ( final String string : line.getValue().split(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR) ) + dictionary.add(string); + break; + } + } + + // if we got here we never found a dictionary, or there are no elements in the dictionary + if ( dictionary.size() == 0 ) + throw new UserException.MalformedBCF2("Dictionary header element was absent or empty"); + } + + public boolean isSkippingGenotypes() { + return skipGenotypes; + } + + public void setSkipGenotypes(final boolean skipGenotypes) { + this.skipGenotypes = skipGenotypes; + } + + // -------------------------------------------------------------------------------- + // + // implicit block + // + // The first four records of BCF are inline untype encoded data of: + // + // 4 byte integer chrom offset + // 4 byte integer start + // 4 byte integer ref length + // 4 byte float qual + // + // -------------------------------------------------------------------------------- + + private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) { + final int contigOffset = decoder.decodeInt(BCFType.INT32.getSizeInBytes()); + final String contig = lookupContigName(contigOffset); + builder.chr(contig); + + final int pos = decoder.decodeInt(BCFType.INT32.getSizeInBytes()); + final int refLength = decoder.decodeInt(BCFType.INT32.getSizeInBytes()); + builder.start((long)pos); + builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals + + final Object qual = decoder.decodeSingleValue(BCFType.FLOAT); + if ( qual != null ) { + builder.log10PError(((Double)qual) / -10.0); + } + + final int nAlleleInfo = decoder.decodeInt(BCFType.INT32.getSizeInBytes()); + final int nFormatSamples = decoder.decodeInt(BCFType.INT32.getSizeInBytes()); + final int nAlleles = nAlleleInfo >> 16; + final int nInfo = nAlleleInfo & 0x00FF; + final int nFormatFields = nFormatSamples >> 24; + final int nSamples = nFormatSamples & 0x0FFF; + + decodeID(builder); + final ArrayList alleles = decodeAlleles(builder, pos, nAlleles); + decodeFilter(builder); + decodeInfo(builder, nInfo); + + return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles); + } + + private final static class SitesInfoForDecoding { + final int pos; + final int nFormatFields; + final int nSamples; + final ArrayList alleles; + + private SitesInfoForDecoding(final int pos, final int nFormatFields, final int nSamples, final ArrayList alleles) { + this.pos = pos; + this.nFormatFields = nFormatFields; + this.nSamples = nSamples; + this.alleles = alleles; + } + } + + private void decodeID( final VariantContextBuilder builder ) { + final String id = (String)decoder.decodeTypedValue(); + + if ( id == null ) { + builder.noID(); + } + else { + builder.id(id); + } + } + + public static ArrayList clipAllelesIfNecessary(int position, String ref, ArrayList unclippedAlleles) { + if ( AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) { + ArrayList clippedAlleles = new ArrayList(unclippedAlleles.size()); + AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1); + return clippedAlleles; + } else + return unclippedAlleles; + } + + private ArrayList decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) { + // TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes + ArrayList alleles = new ArrayList(nAlleles); + String ref = null; + + for ( int i = 0; i < nAlleles; i++ ) { + final String allele = (String)decoder.decodeTypedValue(); + + if ( i == 0 ) { + ref = allele; + alleles.add(Allele.create(allele, true)); + } else { + alleles.add(Allele.create(allele, false)); + } + } + + alleles = clipAllelesIfNecessary(pos, ref, alleles); + builder.alleles(alleles); + + builder.referenceBaseForIndel(ref.getBytes()[0]); + + return alleles; + } + + private void decodeFilter( final VariantContextBuilder builder ) { + final Object filters = decoder.decodeTypedValue(); + + if ( filters == null ) { + builder.unfiltered(); + } + else { + builder.filters(new LinkedHashSet(asStrings(filters))); + } + } + + private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) { + final Map infoFieldEntries = new HashMap(numInfoFields); + + for ( int i = 0; i < numInfoFields; i++ ) { + final String key = getDictionaryString(); + Object value = decoder.decodeTypedValue(); + final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, key); + if ( metaData.getType() == VCFHeaderLineType.Flag ) value = true; // special case for flags + infoFieldEntries.put(key, value); + } + + builder.attributes(infoFieldEntries); + } + + private void decodeGenotypes( final SitesInfoForDecoding siteInfo, final VariantContextBuilder builder ) { + final List samples = new ArrayList(header.getGenotypeSamples()); + final int nSamples = siteInfo.nSamples; + final int nFields = siteInfo.nFormatFields; + final Map> fieldValues = decodeGenotypeFieldValues(nFields, nSamples); + + if ( samples.size() != nSamples ) + throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " + + "different numbers of samples per record. Saw " + samples.size() + + " samples in header but have a record with " + nSamples + " samples"); + + final List genotypes = new ArrayList(nSamples); + for ( int i = 0; i < nSamples; i++ ) { + final String sampleName = samples.get(i); + List alleles = null; + boolean isPhased = false; + double log10PError = VariantContext.NO_LOG10_PERROR; + Set filters = null; + Map attributes = null; + double[] log10Likelihoods = null; + + for ( final Map.Entry> entry : fieldValues.entrySet() ) { + final String field = entry.getKey(); + final List values = entry.getValue(); + if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { + alleles = decodeGenotypeAlleles(siteInfo.alleles, (List)values.get(i)); + } else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { + final Integer value = (Integer)values.get(i); + if ( value != BCFType.INT8.getMissingJavaValue() ) + log10PError = value / -10.0; + } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { + throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2"); + //filters = new HashSet(values.get(i)); + } else { // add to attributes + if ( attributes == null ) attributes = new HashMap(nFields); + attributes.put(field, values.get(i)); + } + } + + if ( alleles == null ) throw new ReviewedStingException("BUG: no alleles found"); + + final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods); + genotypes.add(g); + } + + builder.genotypes(genotypes); + } + + private final List decodeGenotypeAlleles(final ArrayList siteAlleles, final List encoded) { + final List gt = new ArrayList(encoded.size()); + for ( final Integer encode : encoded ) { + if ( encode == null ) // absent, as are all following by definition + return gt; + else { + final int offset = encode >> 1; + if ( offset == 0 ) + gt.add(Allele.NO_CALL); + else + gt.add(siteAlleles.get(offset - 1)); + } + } + return gt; + } + + private final Map> decodeGenotypeFieldValues(final int nFields, final int nSamples) { + final Map> map = new LinkedHashMap>(nFields); + + for ( int i = 0; i < nFields; i++ ) { + final String field = getDictionaryString(); + final byte typeDescriptor = decoder.readTypeDescriptor(); + final List values = new ArrayList(nSamples); + for ( int j = 0; j < nSamples; j++ ) + values.add(decoder.decodeTypedValue(typeDescriptor)); + map.put(field, values); + } + + return map; + } + + private final String getDictionaryString() { + final int offset = (Integer)decoder.decodeTypedValue(); + final String field = dictionary.get(offset); + return field; + } + + private final String lookupContigName( final int contigOffset ) { + if ( contigOffset < contigNames.size() ) { + return contigNames.get(contigOffset); + } + else { + throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames)); + } + } + + // ---------------------------------------------------------------------- + // + // Utility functions + // + // ---------------------------------------------------------------------- + + private final Collection asStrings(final Object o) { + return asCollection(String.class, o); + } + + private final Collection asCollection(final Class c, final Object o) { + if ( o == null ) + return Collections.emptyList(); + else if ( o instanceof List ) { + return (List)o; + } else { + return (Set)Collections.singleton(o); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Constants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Constants.java new file mode 100644 index 000000000..5f92f3886 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Constants.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; + +import java.nio.charset.Charset; + +public class BCF2Constants { + public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d"; + public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1); + public static final String DICTIONARY_LINE_TAG = "dictionary"; + public static final String DICTIONARY_LINE_ENTRY_SEPARATOR = ","; + + public static final Charset BCF2_TEXT_CHARSET = Charset.forName("US-ASCII"); // TODO: enforce this! + + // Note that these values are prefixed by FFFFFF for convenience + public static final int INT8_MISSING_VALUE = 0xFFFFFF80; + public static final int INT16_MISSING_VALUE = 0xFFFF8000; + public static final int INT32_MISSING_VALUE = 0x80000000; + public static final int FLOAT_MISSING_VALUE = 0x7F800001; +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java new file mode 100644 index 000000000..7577b33d7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import org.apache.log4j.Logger; +import org.broad.tribble.FeatureCodec; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; + +public class BCF2Decoder { + final protected static Logger logger = Logger.getLogger(FeatureCodec.class); + + byte[] recordBytes; + ByteArrayInputStream recordStream; + + public BCF2Decoder() { + // nothing to do + } + + /** + * Create a new decoder ready to read BCF2 data from the byte[] recordBytes, for testing purposes + * + * @param recordBytes + */ + protected BCF2Decoder(final byte[] recordBytes) { + setRecordBytes(recordBytes); + } + + // ---------------------------------------------------------------------- + // + // Routines to load, set, skip blocks of underlying data we are decoding + // + // ---------------------------------------------------------------------- + + /** + * Reads the next record from input stream and prepare this decoder to decode values from it + * + * @param stream + * @return + */ + public void readNextBlock(final int blockSizeInBytes, final InputStream stream) { + setRecordBytes(readRecordBytes(blockSizeInBytes, stream)); + } + + /** + * Skips the next record from input stream, invalidating current block data + * + * @param stream + * @return + */ + public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) { + try { + final int bytesRead = (int)stream.skip(blockSizeInBytes); + validateReadBytes(bytesRead, blockSizeInBytes); + } catch ( IOException e ) { + throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e); + } + this.recordBytes = null; + this.recordStream = null; + } + + /** + * Returns the byte[] for the block of data we are currently decoding + * @return + */ + public byte[] getRecordBytes() { + return recordBytes; + } + + /** + * The size of the current block in bytes + * + * @return + */ + public int getBlockSize() { + return recordBytes.length; + } + + public boolean blockIsFullyDecoded() { + return recordStream.available() == 0; + } + + /** + * Use the recordBytes[] to read BCF2 records from now on + * + * @param recordBytes + */ + public void setRecordBytes(final byte[] recordBytes) { + this.recordBytes = recordBytes; + this.recordStream = new ByteArrayInputStream(recordBytes); + } + + // ---------------------------------------------------------------------- + // + // High-level decoder + // + // ---------------------------------------------------------------------- + + public final Object decodeTypedValue() { + final byte typeDescriptor = readTypeDescriptor(); + return decodeTypedValue(typeDescriptor); + } + + public final Object decodeTypedValue(final byte typeDescriptor) { + final int size = TypeDescriptor.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : TypeDescriptor.decodeSize(typeDescriptor); + final BCFType type = TypeDescriptor.decodeType(typeDescriptor); + + assert size >= 0; + + if ( size == 0 ) { + return null; + } else if ( type == BCFType.CHAR ) { // special case string decoding for efficiency + return decodeLiteralString(size); + } else if ( size == 1 ) { + return decodeSingleValue(type); + } else { + final ArrayList ints = new ArrayList(size); + for ( int i = 0; i < size; i++ ) { + ints.add(decodeSingleValue(type)); + } + return ints; + } + } + + public final Object decodeSingleValue(final BCFType type) { + // TODO -- decodeTypedValue should integrate this routine + final int value = readInt(type.getSizeInBytes(), recordStream); + + if ( value == type.getMissingBytes() ) + return null; + else { + switch (type) { + case INT8: + case INT16: + case INT32: return value; + case FLOAT: return (double)rawFloatToFloat(value); + case CHAR: return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased + default: throw new ReviewedStingException("BCF2 codec doesn't know how to decode type " + type ); + } + } + } + + // ---------------------------------------------------------------------- + // + // Decode raw primitive data types (ints, floats, and strings) + // + // ---------------------------------------------------------------------- + + private final String decodeLiteralString(final int size) { + // TODO -- assumes size > 0 + final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array + try { + recordStream.read(bytes); + return new String(bytes); + } catch ( IOException e ) { + throw new ReviewedStingException("readByte failure", e); + } + } + + private final int decodeVectorSize() { + final byte typeDescriptor = readTypeDescriptor(); + final int size = TypeDescriptor.decodeSize(typeDescriptor); + final BCFType type = TypeDescriptor.decodeType(typeDescriptor); + + assert size == 1; + assert type == BCFType.INT8 || type == BCFType.INT16 || type == BCFType.INT32; + + return decodeInt(type.getSizeInBytes()); + } + + public final int decodeInt(int bytesForEachInt) { + return readInt(bytesForEachInt, recordStream); + } + + public final float rawFloatToFloat(final int rawFloat) { + return Float.intBitsToFloat(rawFloat); + } + + // ---------------------------------------------------------------------- + // + // Utility functions + // + // ---------------------------------------------------------------------- + + /** + * Read the size of the next block from inputStream + * + * @param inputStream + * @return + */ + public final int readBlockSize(final InputStream inputStream) { + return readInt(4, inputStream); + } + + /** + * + * @param inputStream + * @return + */ + private final static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) { + final byte[] record = new byte[blockSizeInBytes]; + try { + final int bytesRead = inputStream.read(record); + validateReadBytes(bytesRead, blockSizeInBytes); + } catch ( IOException e ) { + throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e); + } + + return record; + } + + private final static void validateReadBytes(final int actuallyRead, final int expected) { + if ( actuallyRead < expected ) { + throw new UserException.MalformedBCF2(String.format("Failed to read next complete record: %s", + actuallyRead == -1 ? + "premature end of input stream" : + String.format("expected %d bytes but read only %d", expected, actuallyRead))); + } + } + + public final byte readTypeDescriptor() { + return readByte(recordStream); + } + + private final static byte readByte(final InputStream stream) { + try { + return (byte)(stream.read() & 0xFF); + } catch ( IOException e ) { + throw new ReviewedStingException("readByte failure", e); + } + } + + private final static int readInt(int bytesForEachInt, final InputStream stream) { + switch ( bytesForEachInt ) { + case 1: { + return (byte)(readByte(stream)); + } case 2: { + final int b1 = readByte(stream) & 0xFF; + final int b2 = readByte(stream) & 0xFF; + return (short)((b1 << 8) | b2); + } case 4: { + final int b1 = readByte(stream) & 0xFF; + final int b2 = readByte(stream) & 0xFF; + final int b3 = readByte(stream) & 0xFF; + final int b4 = readByte(stream) & 0xFF; + return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4); + } default: throw new ReviewedStingException("Unexpected size during decoding"); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java new file mode 100644 index 000000000..87439915f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.*; + +/** + * Simple BCF2 encoder + * + * @author depristo + * @since 5/12 + */ +public class BCF2Encoder { + // TODO -- increase default size? + public static final int WRITE_BUFFER_INITIAL_SIZE = 16384; + private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE); + + // -------------------------------------------------------------------------------- + // + // Functions to return the data being encoded here + // + // -------------------------------------------------------------------------------- + + public int getRecordSizeInBytes() { + return encodeStream.size(); + } + + public byte[] getRecordBytes() { + byte[] bytes = encodeStream.toByteArray(); + encodeStream.reset(); + return bytes; + } + + // -------------------------------------------------------------------------------- + // + // Super-high level interface + // + // -------------------------------------------------------------------------------- + + /** + * Totally generic encoder that examines o, determines the best way to encode it, and encodes it + * @param o + * @return + */ + public final BCFType encode(final Object o) throws IOException { + if ( o == null ) throw new ReviewedStingException("Generic encode cannot deal with null values"); + + if ( o instanceof String ) { + return encodeString((String)o); + } else if ( o instanceof List ) { + final BCFType type = determinePrimitiveType(((List) o).get(0)); + encodeTypedVector((List) o, type); + return type; + } else { + final BCFType type = determinePrimitiveType(o); + encodeTypedSingleton(o, type); + return type; + } + } + + // -------------------------------------------------------------------------------- + // + // Writing typed values (have type byte) + // + // -------------------------------------------------------------------------------- + + public final void encodeTypedMissing(final BCFType type) throws IOException { + encodeTypedVector(Collections.emptyList(), type); + } + + // todo -- should be specialized for each object type for efficiency + public final void encodeTypedSingleton(final Object v, final BCFType type) throws IOException { + encodeTypedVector(Collections.singleton(v), type); + } + + public final BCFType encodeString(final String v) throws IOException { + // TODO -- this needs to be optimized + final byte[] bytes = v.getBytes(); + final List l = new ArrayList(bytes.length); + for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]); + encodeTypedVector(l, BCFType.CHAR); + return BCFType.CHAR; + } + + public final void encodeTypedVector(final Collection v, final BCFType type) throws IOException { + encodeType(v.size(), type); + encodeRawValues(v, type); + } + + public final BCFType encodeTypedIntOfBestSize(final int value) throws IOException { + final BCFType type = determineIntegerType(value); + encodeTypedSingleton(value, type); + return type; + } + + // -------------------------------------------------------------------------------- + // + // Writing raw values (don't have a type byte) + // + // -------------------------------------------------------------------------------- + + public final void encodeRawValues(final Collection v, final BCFType type) throws IOException { + for ( final T v1 : v ) { + encodeRawValue(v1, type); + } + } + + public final void encodeRawValue(final T value, final BCFType type) throws IOException { + if ( value == type.getMissingJavaValue() ) + encodeRawMissingValue(type); + else { + switch (type) { + case INT8: + case INT16: + case INT32: encodePrimitive((Integer)value, type); break; + case FLOAT: encodeRawFloat((Float) value, type); break; + case CHAR: encodeRawChar((Byte) value); break; + default: throw new ReviewedStingException("Illegal type encountered " + type); + } + } + } + + public final void encodeRawMissingValue(final BCFType type) throws IOException { + encodePrimitive(type.getMissingBytes(), type); + } + + public final void encodeRawMissingValues(final int size, final BCFType type) throws IOException { + for ( int i = 0; i < size; i++ ) + encodeRawMissingValue(type); + } + + // -------------------------------------------------------------------------------- + // + // low-level encoders + // + // -------------------------------------------------------------------------------- + + public final void encodeRawChar(final byte c) throws IOException { + encodeStream.write(c); + } + + public final void encodeRawFloat(final float value, final BCFType type) throws IOException { + encodePrimitive(Float.floatToIntBits(value), type); + } + + public final void encodeType(final int size, final BCFType type) throws IOException { + final byte typeByte = TypeDescriptor.encodeTypeDescriptor(size, type); + encodeStream.write(typeByte); + if ( TypeDescriptor.willOverflow(size) ) + encodeTypedIntOfBestSize(size); + } + + public final void encodeRawInt(final int value, final BCFType type) throws IOException { + encodePrimitive(value, type, encodeStream); + } + + public final void encodePrimitive(final int value, final BCFType type) throws IOException { + encodePrimitive(value, type, encodeStream); + } + + // -------------------------------------------------------------------------------- + // + // utility functions + // + // -------------------------------------------------------------------------------- + + public final BCFType determineIntegerType(final List values) { + BCFType maxType = BCFType.INT8; + for ( final int value : values ) { + final BCFType type1 = determineIntegerType(value); + switch ( type1 ) { + case INT8: break; + case INT16: maxType = BCFType.INT16; break; + case INT32: return BCFType.INT32; // fast path for largest possible value + default: throw new ReviewedStingException("Unexpected integer type " + type1 ); + } + } + return maxType; + } + + public final BCFType determineIntegerType(final int value) { + for ( final BCFType potentialType : TypeDescriptor.INTEGER_TYPES_BY_SIZE ) { + if ( potentialType.withinRange(value) ) + return potentialType; + } + + throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value); + } + + private final BCFType determinePrimitiveType(final Object v) { + if ( v instanceof Integer ) + return determineIntegerType((Integer)v); + else if ( v instanceof Float ) + return BCFType.FLOAT; + else + throw new ReviewedStingException("No native encoding for Object of type " + v.getClass().getSimpleName()); + } + + public final static void encodePrimitive(final int value, final BCFType type, final OutputStream encodeStream) throws IOException { + for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) { + final int shift = i * 8; + int mask = 0xFF << shift; + int byteValue = (mask & value) >> shift; + encodeStream.write(byteValue); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2TestWalker.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2TestWalker.java new file mode 100755 index 000000000..4bc43aea5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2TestWalker.java @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import org.broad.tribble.FeatureCodecHeader; +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.io.*; +import java.util.*; + +/** + * Testing BCF2 + * + * @author Mark DePristo + * @since 2012 + */ +public class BCF2TestWalker extends RodWalker { + /** + * Variants from this VCF file are used by this tool as input. + * The file must at least contain the standard VCF header lines, but + * can be empty (i.e., no variants are contained in the file). + */ + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public RodBinding variants; + + @Argument(doc="keep variants", required=false) + public boolean keepVariants = false; + + @Argument(doc="quiet", required=false) + public boolean quiet = false; + + @Argument(doc="dontIndexOnTheFly", required=false) + public boolean dontIndexOnTheFly = false; + + @Output(doc="File to which results should be written",required=true) + protected File bcfFile; + + private final List vcs = new ArrayList(); + protected BCF2Writer writer; + + @Override + public void initialize() { + final Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Collections.singletonList(variants)); + final VCFHeader header = VCFUtils.withUpdatedContigs(vcfRods.values().iterator().next(), getToolkit()); + try { + writer = new BCF2Writer("out", bcfFile, new FileOutputStream(bcfFile), + getToolkit().getMasterSequenceDictionary(), ! dontIndexOnTheFly ); + writer.writeHeader(header); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(bcfFile, e); + } + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return 0; + + for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) { + writer.add(vc); + if ( keepVariants ) vcs.add(vc); + } + + return 1; + } + + // + // default reduce -- doesn't do anything at all + // + public Integer reduceInit() { return 0; } + public Integer reduce(Integer counter, Integer sum) { return counter + sum; } + + public void onTraversalDone(Integer sum) { + try { + writer.close(); + logger.info("Closed writer"); + + // read in the BCF records + BCF2Codec codec = new BCF2Codec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(bcfFile)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(bcfFile)); + pbs.skip(header.getHeaderEnd()); + Iterator it = vcs.iterator(); + while ( ! pbs.isDone() ) { + if ( keepVariants ) { + VariantContext expected = it.next(); + if ( ! quiet ) + System.out.printf("vcf = %s %d %s%n", expected.getChr(), expected.getStart(), expected); + } + VariantContext bcfRaw = codec.decode(pbs); + VariantContext bcf = new VariantContextBuilder(bcfRaw).source("variant").make(); + if ( ! quiet ) { + System.out.printf("bcf = %s %d %s%n", bcf.getChr(), bcf.getStart(), bcf.toString()); + System.out.printf("--------------------------------------------------%n"); + } + } + + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(bcfFile, "bad user!"); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Writer.java new file mode 100644 index 000000000..d44ecfa78 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Writer.java @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import net.sf.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.util.*; + +public class BCF2Writer extends IndexingVCFWriter { + final protected static Logger logger = Logger.getLogger(BCF2Writer.class); + private final static boolean doNotWriteGenotypes = false; + private OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support + private VCFHeader header; + private Map contigDictionary = new HashMap(); + private Map stringDictionary = new LinkedHashMap(); + + private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives + + public BCF2Writer(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) { + super(name, location, output, refDict, enableOnTheFlyIndexing); + this.outputStream = getOutputStream(); + } + + // -------------------------------------------------------------------------------- + // + // Interface functions + // + // -------------------------------------------------------------------------------- + + @Override + public void writeHeader(final VCFHeader header) { + this.header = header; + + // create the config offsets map + for ( final VCFContigHeaderLine contig : header.getContigLines()) + contigDictionary.put(contig.getID(), contig.getContigIndex()); + + // set up the strings dictionary + int offset = 0; + stringDictionary.put(VCFConstants.PASSES_FILTERS_v4, offset++); // special case the special PASS field + for ( VCFHeaderLine line : header.getMetaData() ) { + if ( line instanceof VCFIDHeaderLine ) { + VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; + stringDictionary.put(idLine.getID(), offset++); + } + } + + // add the dictionary ##dictionary=x,y,z line to the header + final String dictionaryLineValue = Utils.join(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR, stringDictionary.keySet()); + header.addMetaDataLine(new VCFHeaderLine(BCF2Constants.DICTIONARY_LINE_TAG, dictionaryLineValue)); + + // write out the header + StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Constants.VERSION_LINE, "BCF2 stream"); + } + + @Override + public void add( final VariantContext initialVC ) { + final VariantContext vc = initialVC.fullyDecode(header); + super.add(vc); // allow on the fly indexing + + try { + final byte[] infoBlock = buildSitesData(vc); + final byte[] genotypesBlock = buildSamplesData(vc); + + // write the two blocks to disk + writeBlock(infoBlock, genotypesBlock); + } + catch ( IOException e ) { + throw new UserException("Error writing record to BCF2 file: " + vc.toString(), e); + } + } + + @Override + public void close() { + try { + outputStream.flush(); + outputStream.close(); + } + catch ( IOException e ) { + throw new UserException("Failed to close BCF2 file"); + } + super.close(); + } + + // -------------------------------------------------------------------------------- + // + // implicit block + // + // The first four records of BCF are inline untype encoded data of: + // + // 4 byte integer chrom offset + // 4 byte integer start + // 4 byte integer ref length + // 4 byte float qual + // + // -------------------------------------------------------------------------------- + private byte[] buildSitesData( VariantContext vc ) throws IOException { + final int contigIndex = contigDictionary.get(vc.getChr()); + if ( contigIndex == -1 ) + throw new UserException(String.format("Contig %s not found in sequence dictionary from reference", vc.getChr())); + + // note use of encodeRawValue to not insert the typing byte + encoder.encodeRawValue(contigIndex, BCFType.INT32); + + // pos + encoder.encodeRawValue(vc.getStart(), BCFType.INT32); + + // ref length + encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCFType.INT32); + + // qual + if ( vc.hasLog10PError() ) + encoder.encodeRawFloat((float) vc.getPhredScaledQual(), BCFType.FLOAT); + else + encoder.encodeRawMissingValue(BCFType.FLOAT); + + // info fields + final int nAlleles = vc.getNAlleles(); + final int nInfo = vc.getAttributes().size(); + final int nGenotypeFormatFields = StandardVCFWriter.calcVCFGenotypeKeys(vc).size(); + final int nSamples = vc.getNSamples(); + + encoder.encodeRawInt((nAlleles << 16) | (nInfo & 0x00FF), BCFType.INT32); + encoder.encodeRawInt((nGenotypeFormatFields << 24) | (nSamples & 0x0FFF), BCFType.INT32); + + buildID(vc); + buildAlleles(vc); + buildFilter(vc); + buildInfo(vc); + + return encoder.getRecordBytes(); + } + + private void buildID( VariantContext vc ) throws IOException { + encoder.encodeString(vc.getID()); + } + + private void buildAlleles( VariantContext vc ) throws IOException { + for ( final Allele allele : vc.getAlleles() ) { + final String s = vc.getAlleleWithRefPadding(allele); + encoder.encodeString(s); + } + } + + private void buildFilter( VariantContext vc ) throws IOException { + if ( vc.isFiltered() ) { + encodeStringsByRef(vc.getFilters()); + } else { + encoder.encodeTypedMissing(BCFType.INT32); + } + } + + private void buildInfo( VariantContext vc ) throws IOException { + for ( Map.Entry infoFieldEntry : vc.getAttributes().entrySet() ) { + final String key = infoFieldEntry.getKey(); + Object value = infoFieldEntry.getValue(); + + final VCFToBCFType typeEquiv = getBCF2TypeFromHeader(key, value); + // handle the special FLAG case -- super annoying + if ( typeEquiv.vcfType == VCFHeaderLineType.Flag ) value = 1; + + encodeStringByRef(key); + if ( value instanceof List ) // NOTE: ONLY WORKS WITH LISTS + encoder.encodeTypedVector((List) value, typeEquiv.bcfType); + else if ( value instanceof String ) + encoder.encodeString((String)value); + else + encoder.encodeTypedSingleton(value, typeEquiv.bcfType); + } + } + + private byte[] buildSamplesData(final VariantContext vc) throws IOException { + // write size + List genotypeFields = StandardVCFWriter.calcVCFGenotypeKeys(vc); + for ( final String field : genotypeFields ) { + if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { + addGenotypes(vc); + } else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { + addGQ(vc); + } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { + addGenotypeFilters(vc); + } else { + addGenericGenotypeField(vc, field); + } + } + + return encoder.getRecordBytes(); + } + + private final int getNGenotypeFieldValues(final String field, final VariantContext vc) { + final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field); + int nFields = metaData.getCount(vc.getAlternateAlleles().size()); + if ( nFields == -1 ) { // unbounded, need to look at values + return computeMaxSizeOfGenotypeFieldFromValues(field, vc); + } else { + return nFields; + } + } + + private final int computeMaxSizeOfGenotypeFieldFromValues(final String field, final VariantContext vc) { + int size = 1; + final GenotypesContext gc = vc.getGenotypes(); + + for ( final Genotype g : gc ) { + final Object o = g.getAttribute(field); + if ( o == null ) continue; + if ( o instanceof List ) { + // only do compute if first value is of type list + final List values = (List)g.getAttribute(field); + if ( values != null ) + size = Math.max(size, values.size()); + } else { + return 1; + } + } + + return size; + } + + private final void addGenericGenotypeField(final VariantContext vc, final String field) throws IOException { + final int numInFormatField = getNGenotypeFieldValues(field, vc); + final VCFToBCFType type = getBCF2TypeFromHeader(field, null); + + startGenotypeField(field, numInFormatField, type.bcfType); + for ( final Genotype g : vc.getGenotypes() ) { + if ( ! g.hasAttribute(field) ) { + encoder.encodeRawMissingValues(numInFormatField, type.bcfType); + } else { + final Object val = g.getAttribute(field); + final Collection vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val; + encoder.encodeRawValues(vals, type.bcfType); + } + } + } + + private final class VCFToBCFType { + VCFHeaderLineType vcfType; + BCFType bcfType; + + private VCFToBCFType(final VCFHeaderLineType vcfType, final BCFType bcfType) { + this.vcfType = vcfType; + this.bcfType = bcfType; + } + } + + // TODO -- we really need explicit converters as first class objects + private final VCFToBCFType getBCF2TypeFromHeader(final String field, final Object maybeIntValue) { + // TODO -- need to generalize so we can enable vectors of compressed genotype ints + final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field); + + // TODO -- no sense in allocating these over and over + switch ( metaData.getType() ) { + case Character: return new VCFToBCFType(metaData.getType(), BCFType.CHAR); + case Flag: return new VCFToBCFType(metaData.getType(), BCFType.INT8); + case String: return new VCFToBCFType(metaData.getType(), BCFType.CHAR); + case Integer: return new VCFToBCFType(metaData.getType(), maybeIntValue != null ? encoder.determineIntegerType((Integer)maybeIntValue) : BCFType.INT32); + case Float: return new VCFToBCFType(metaData.getType(), BCFType.FLOAT); + default: throw new ReviewedStingException("Unexpected type for field " + field); + } + } + + private final void addGenotypeFilters(final VariantContext vc) throws IOException { + logger.warn("Skipping genotype filter field"); +// // TODO -- FIXME -- string is wrong here -- need to compute string size... +// startGenotypeField(VCFConstants.GENOTYPE_FILTER_KEY, 1, BCFType.CHAR); +// for ( final Genotype g : vc.getGenotypes() ) { +// if ( g.filtersWereApplied() && g.isFiltered() ) { +// encoder.encodeString(ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters()))); +// } else { +// encoder.encodeRawMissingValues(1, BCFType.CHAR); // todo fixme +// } +// } + } + + private final void addGQ(final VariantContext vc) throws IOException { + startGenotypeField(VCFConstants.GENOTYPE_QUALITY_KEY, 1, BCFType.INT8); + for ( final Genotype g : vc.getGenotypes() ) { + if ( g.hasLog10PError() ) { + final int GQ = (int)Math.round(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL)); + if ( GQ > VCFConstants.MAX_GENOTYPE_QUAL ) throw new ReviewedStingException("Unexpectedly large GQ " + GQ + " at " + vc); + encoder.encodeRawValue(GQ, BCFType.INT8); + } else { + encoder.encodeRawMissingValues(1, BCFType.INT8); + } + } + } + + private final void addGenotypes(final VariantContext vc) throws IOException { + if ( vc.getNAlleles() > 127 ) + throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " + + "with > 127 alleles, but you have " + vc.getNAlleles() + " at " + + vc.getChr() + ":" + vc.getStart()); + + final Map alleleMap = StandardVCFWriter.buildAlleleMap(vc); + final int requiredPloidy = 2; // TODO -- handle ploidy, will need padding / depadding + startGenotypeField(VCFConstants.GENOTYPE_KEY, requiredPloidy, BCFType.INT8); + for ( final Genotype g : vc.getGenotypes() ) { + if ( g.getPloidy() != requiredPloidy ) throw new ReviewedStingException("Cannot currently handle non-diploid calls!"); + final List encoding = new ArrayList(requiredPloidy); + for ( final Allele a : g.getAlleles() ) { + final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a)); + encoding.add(((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00)); + } + encoder.encodeRawValues(encoding, BCFType.INT8); + } + } + + /** + * Write the data in the encoder to the outputstream as a length encoded + * block of data. After this call the encoder stream will be ready to + * start a new data block + * + * @throws IOException + */ + private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException { + BCF2Encoder.encodePrimitive(infoBlock.length, BCFType.INT32, outputStream); + BCF2Encoder.encodePrimitive(genotypesBlock.length, BCFType.INT32, outputStream); + outputStream.write(infoBlock); + outputStream.write(genotypesBlock); + } + + public final BCFType encodeStringByRef(final String string) throws IOException { + return encodeStringsByRef(Collections.singleton(string)); + } + + public final BCFType encodeStringsByRef(final Collection strings) throws IOException { + final List offsets = new ArrayList(strings.size()); + BCFType maxType = BCFType.INT8; // start with the smallest size + + // iterate over strings until we find one that needs 16 bits, and break + for ( final String string : strings ) { + final int offset = stringDictionary.get(string); + offsets.add(offset); + final BCFType type1 = encoder.determineIntegerType(offset); + switch ( type1 ) { + case INT8: break; + case INT16: if ( maxType == BCFType.INT8 ) maxType = BCFType.INT16; break; + case INT32: maxType = BCFType.INT32; break; + default: throw new ReviewedStingException("Unexpected type " + type1); + } + } + + // we've checked the types for all strings, so write them out + encoder.encodeTypedVector(offsets, maxType); + return maxType; + } + + public final void startGenotypeField(final String key, final int size, final BCFType valueType) throws IOException { + encodeStringByRef(key); + encoder.encodeType(size, valueType); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCFType.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCFType.java new file mode 100644 index 000000000..ed7bf52a2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCFType.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +/** + * BCF2 types and information + * + * @author depristo + * @since 05/12 + */ +public enum BCFType { + RESERVED_0, + INT8(1, BCF2Constants.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range + INT16(2, BCF2Constants.INT16_MISSING_VALUE, -32767, 32767), + INT32(4, BCF2Constants.INT32_MISSING_VALUE, -2147483647, 2147483647), + RESERVED_4, + FLOAT(4, BCF2Constants.FLOAT_MISSING_VALUE), + RESERVED_6, + CHAR; + + private final Object missingJavaValue; + private final int missingBytes; + private final int sizeInBytes; + private final long minValue, maxValue; + + BCFType() { + this(-1, 0, 0, 0); + } + + BCFType(final int sizeInBytes, final int missingBytes) { + this(sizeInBytes, missingBytes, 0, 0); + } + + BCFType(final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) { + this.sizeInBytes = sizeInBytes; + this.missingJavaValue = null; + this.missingBytes = missingBytes; + this.minValue = minValue; + this.maxValue = maxValue; + } + + public int getSizeInBytes() { + return sizeInBytes; + } + public int getID() { return ordinal(); } + public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } + public Object getMissingJavaValue() { return missingJavaValue; } + public int getMissingBytes() { return missingBytes; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/TypeDescriptor.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/TypeDescriptor.java new file mode 100644 index 000000000..4eb59df5d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/TypeDescriptor.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +/** + * Simple BCF decoder + * @author Mark DePristo + * @since 5/3/12 + */ +public class TypeDescriptor { + public static final int OVERFLOW_ELEMENT_MARKER = 15; + public static final int MAX_INLINE_ELEMENTS = 14; + + public final static BCFType[] INTEGER_TYPES_BY_SIZE = new BCFType[3]; + public final static BCFType[] DICTIONARY_TYPES_BY_SIZE = INTEGER_TYPES_BY_SIZE; + private final static BCFType[] lookup = BCFType.values(); + + static { + INTEGER_TYPES_BY_SIZE[0] = BCFType.INT8; + INTEGER_TYPES_BY_SIZE[1] = BCFType.INT16; + INTEGER_TYPES_BY_SIZE[2] = BCFType.INT32; + } + + public final static byte encodeTypeDescriptor(final int nElements, final BCFType type ) { + int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER); + byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F)); + return typeByte; + } + + public final static int decodeSize(final byte typeDescriptor) { + return (0xF0 & typeDescriptor) >> 4; + } + + public final static int decodeTypeID(final byte typeDescriptor) { + return typeDescriptor & 0x0F; + } + + public final static BCFType decodeType(final byte typeDescriptor) { + return lookup[decodeTypeID(typeDescriptor)]; + } + + public final static boolean sizeIsOverflow(final byte typeDescriptor) { + return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER; + } + + public final static boolean willOverflow(final long nElements) { + return nElements > MAX_INLINE_ELEMENTS; + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java new file mode 100644 index 000000000..eb4e6393e --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.utils.codecs.bcf2; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.*; + + +public class EncoderDecoderUnitTest extends BaseTest { + private final float FLOAT_TOLERANCE = (float)1e-8; + final List primitives = new ArrayList(); + final List basicTypes = new ArrayList(); + final List forCombinations = new ArrayList(); + + @BeforeSuite + public void before() { + basicTypes.add(new BCF2TypedValue(1, BCFType.INT8)); + basicTypes.add(new BCF2TypedValue(1000, BCFType.INT16)); + basicTypes.add(new BCF2TypedValue(1000000, BCFType.INT32)); + basicTypes.add(new BCF2TypedValue(1.2345e6, BCFType.FLOAT)); + basicTypes.add(new BCF2TypedValue(new Byte((byte)'A'), BCFType.CHAR)); + + // small ints + primitives.add(new BCF2TypedValue(0, BCFType.INT8)); + primitives.add(new BCF2TypedValue(10, BCFType.INT8)); + primitives.add(new BCF2TypedValue(-1, BCFType.INT8)); + primitives.add(new BCF2TypedValue(100, BCFType.INT8)); + primitives.add(new BCF2TypedValue(-100, BCFType.INT8)); + primitives.add(new BCF2TypedValue(-127, BCFType.INT8)); // last value in range + primitives.add(new BCF2TypedValue( 127, BCFType.INT8)); // last value in range + + // medium ints + primitives.add(new BCF2TypedValue(-1000, BCFType.INT16)); + primitives.add(new BCF2TypedValue(1000, BCFType.INT16)); + primitives.add(new BCF2TypedValue(-128, BCFType.INT16)); // first value in range + primitives.add(new BCF2TypedValue( 128, BCFType.INT16)); // first value in range + primitives.add(new BCF2TypedValue(-32767, BCFType.INT16)); // last value in range + primitives.add(new BCF2TypedValue( 32767, BCFType.INT16)); // last value in range + + // larger ints + primitives.add(new BCF2TypedValue(-32768, BCFType.INT32)); // first value in range + primitives.add(new BCF2TypedValue( 32768, BCFType.INT32)); // first value in range + primitives.add(new BCF2TypedValue(-100000, BCFType.INT32)); + primitives.add(new BCF2TypedValue(100000, BCFType.INT32)); + primitives.add(new BCF2TypedValue(-2147483647, BCFType.INT32)); + primitives.add(new BCF2TypedValue(2147483647, BCFType.INT32)); + + // floats + primitives.add(new BCF2TypedValue(0.0, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(-0.0, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(1.0, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(-1.0, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(1.1, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(-1.1, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(5.0 / 3.0, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(-5.0 / 3.0, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(1.23e3, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(1.23e6, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(1.23e9, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(1.23e12, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(1.23e15, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(-1.23e3, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(-1.23e6, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(-1.23e9, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(-1.23e12, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(-1.23e15, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(Float.MIN_VALUE, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(Float.MAX_VALUE, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(Float.NEGATIVE_INFINITY, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(Float.POSITIVE_INFINITY, BCFType.FLOAT)); + primitives.add(new BCF2TypedValue(Float.NaN, BCFType.FLOAT)); + + // strings + //primitives.add(new BCF2TypedValue("", BCFType.CHAR)); <- will be null (which is right) + primitives.add(new BCF2TypedValue("S", BCFType.CHAR)); + primitives.add(new BCF2TypedValue("S2", BCFType.CHAR)); + primitives.add(new BCF2TypedValue("12345678910", BCFType.CHAR)); + primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR)); + primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR)); + + // missing values + for ( BCFType type : BCFType.values() ) { + primitives.add(new BCF2TypedValue(null, type)); + } + + forCombinations.add(new BCF2TypedValue(10, BCFType.INT8)); + forCombinations.add(new BCF2TypedValue(100, BCFType.INT8)); + forCombinations.add(new BCF2TypedValue(-100, BCFType.INT8)); + forCombinations.add(new BCF2TypedValue(-128, BCFType.INT16)); // first value in range + forCombinations.add(new BCF2TypedValue( 128, BCFType.INT16)); // first value in range + forCombinations.add(new BCF2TypedValue(-100000, BCFType.INT32)); + forCombinations.add(new BCF2TypedValue(100000, BCFType.INT32)); + forCombinations.add(new BCF2TypedValue(0.0, BCFType.FLOAT)); + forCombinations.add(new BCF2TypedValue(1.23e6, BCFType.FLOAT)); + forCombinations.add(new BCF2TypedValue(-1.23e6, BCFType.FLOAT)); + forCombinations.add(new BCF2TypedValue("S", BCFType.CHAR)); + forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR)); + forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR)); + + // missing values + for ( BCFType type : BCFType.values() ) { + forCombinations.add(new BCF2TypedValue(null, type)); + } + + } + + // -------------------------------------------------------------------------------- + // + // merge case Provider + // + // -------------------------------------------------------------------------------- + + private class BCF2TypedValue { + final BCFType type; + final Object value; + + private BCF2TypedValue(final int value, final BCFType type) { + this(new Integer(value), type); + } + + private BCF2TypedValue(final double value, final BCFType type) { + this(new Float(value), type); + } + + private BCF2TypedValue(final Object value, final BCFType type) { + this.type = type; + this.value = value; + } + + public boolean isMissing() { return value == null; } + + @Override + public String toString() { + return String.format("%s of %s", value, type); + } + } + + @DataProvider(name = "BCF2EncodingTestProviderSingletons") + public Object[][] BCF2EncodingTestProviderSingletons() { + List tests = new ArrayList(); + for ( BCF2TypedValue tv : primitives ) + tests.add(new Object[]{Arrays.asList(tv)}); + return tests.toArray(new Object[][]{}); + } + + @DataProvider(name = "BCF2EncodingTestProviderBasicTypes") + public Object[][] BCF2EncodingTestProviderBasicTypes() { + List tests = new ArrayList(); + for ( BCF2TypedValue tv : basicTypes ) + tests.add(new Object[]{Arrays.asList(tv)}); + return tests.toArray(new Object[][]{}); + } + + @DataProvider(name = "BCF2EncodingTestProviderSequences") + public Object[][] BCF2EncodingTestProviderSequences() { + List tests = new ArrayList(); + for ( BCF2TypedValue tv1 : forCombinations ) + for ( BCF2TypedValue tv2 : forCombinations ) + for ( BCF2TypedValue tv3 : forCombinations ) + tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3)}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BCF2EncodingTestProviderSingletons") + public void testBCF2EncodingSingletons(final List toEncode) throws IOException { + final byte[] record = encodeRecord(toEncode); + decodeRecord(toEncode, record); + } + + @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") + public void testBCF2EncodingVectors(final List toEncode) throws IOException { + for ( final BCF2TypedValue tv : toEncode ) { + for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { + BCF2Encoder encoder = new BCF2Encoder(); + List expected = Collections.nCopies(length, tv.value); + encoder.encodeTypedVector(expected, tv.type); + + BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); + final Object decoded = decoder.decodeTypedValue(); + + if ( tv.type == BCFType.CHAR ) { + Assert.assertTrue(decoded instanceof String); + final String decodedString = (String)decoded; + Assert.assertTrue(decodedString.length() == length); + } else { + Assert.assertTrue(decoded instanceof List); + final List decodedList = (List)decoded; + Assert.assertEquals(decodedList.size(), expected.size()); + for ( Object decodedValue : decodedList ) + myAssertEquals(tv, decodedValue); + } + } + } + } + + @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") + public void testBCF2EncodingVectorsWithMissing(final List toEncode) throws IOException { + for ( final BCF2TypedValue tv : toEncode ) { + if ( tv.type != BCFType.CHAR ) { + for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { + final byte td = TypeDescriptor.encodeTypeDescriptor(1, tv.type); + + final BCF2Encoder encoder = new BCF2Encoder(); + for ( int i = 0; i < length; i++ ) { + encoder.encodeRawValue(i % 2 == 0 ? null : tv.value, tv.type); + } + + final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); + + for ( int i = 0; i < length; i++ ) { + final Object decoded = decoder.decodeTypedValue(td); + myAssertEquals(i % 2 == 0 ? new BCF2TypedValue(null, tv.type) : tv, decoded); + } + } + } + } + } + + + @Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingSingletons") + public void testBCF2EncodingTestProviderSequences(final List toEncode) throws IOException { + final byte[] record = encodeRecord(toEncode); + decodeRecord(toEncode, record); + } + + @Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingTestProviderSequences") + public void testReadAndSkipWithMultipleBlocks(final List block) throws IOException { + testReadAndSkipWithMultipleBlocks(block, forCombinations); + testReadAndSkipWithMultipleBlocks(forCombinations, block); + } + + public void testReadAndSkipWithMultipleBlocks(final List block1, final List block2) throws IOException { + final byte[] record1 = encodeRecord(block1); + final byte[] record2 = encodeRecord(block2); + + // each record is individually good + decodeRecord(block1, record1); + decodeRecord(block2, record2); + + BCF2Decoder decoder = new BCF2Decoder(); + + // test setting + decoder.setRecordBytes(record1); + decodeRecord(block1, decoder); + decoder.setRecordBytes(record2); + decodeRecord(block2, decoder); + + // test combining the streams + final byte[] combined = combineRecords(record1, record2); + final List combinedObjects = new ArrayList(block1); + combinedObjects.addAll(block2); + + // the combined bytes is the same as the combined objects + InputStream stream = new ByteArrayInputStream(combined); + decoder.readNextBlock(record1.length, stream); + decodeRecord(block1, decoder); + decoder.readNextBlock(record2.length, stream); + decodeRecord(block2, decoder); + + // skipping the first block allows us to read the second block directly + stream = new ByteArrayInputStream(combined); + decoder.skipNextBlock(record1.length, stream); + decoder.readNextBlock(record2.length, stream); + decodeRecord(block2, decoder); + } + + private final byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(record1); + baos.write(record2); + return baos.toByteArray(); + } + + private final byte[] encodeRecord(final List toEncode) throws IOException { + BCF2Encoder encoder = new BCF2Encoder(); + + for ( final BCF2TypedValue tv : toEncode ) { + if ( tv.isMissing() ) + encoder.encodeTypedMissing(tv.type); + else { + final BCFType encodedType = encoder.encode(tv.value); + if ( tv.type != null ) // only if we have an expectation + Assert.assertEquals(encodedType, tv.type); + } + } + + // check output + final byte[] record = encoder.getRecordBytes(); + Assert.assertNotNull(record); + Assert.assertTrue(record.length > 0); + return record; + } + + private final void decodeRecord(final List toEncode, final byte[] record) { + decodeRecord(toEncode, new BCF2Decoder(record)); + } + + private final void decodeRecord(final List toEncode, final BCF2Decoder decoder) { + for ( final BCF2TypedValue tv : toEncode ) { + Assert.assertFalse(decoder.blockIsFullyDecoded()); + final Object decoded = decoder.decodeTypedValue(); + + myAssertEquals(tv, decoded); + } + + Assert.assertTrue(decoder.blockIsFullyDecoded()); + } + + private final void myAssertEquals(final BCF2TypedValue tv, final Object decoded) { + if ( tv.value == null ) { // special needs for instanceof double + Assert.assertEquals(decoded, tv.value); + } else if ( tv.type == BCFType.FLOAT ) { // need tolerance for floats, and they aren't null + Assert.assertTrue(decoded instanceof Double); + + final float valueFloat = (float)(Float)tv.value; + final float decodedFloat = (float)(double)(Double)decoded; + + if ( Float.isNaN(valueFloat) ) // NaN == NaN => false unfortunately + Assert.assertTrue(Float.isNaN(decodedFloat)); + else { + Assert.assertEquals(decodedFloat, valueFloat, FLOAT_TOLERANCE); + } + } else + Assert.assertEquals(decoded, tv.value); + } +} \ No newline at end of file