From ce9e9eebb13c3dc100298a48dd87a1ad5fac521a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 12 May 2012 13:07:08 -0400 Subject: [PATCH] No dictionary in header. Now built dynamically from the header in the writer and codec -- Created BCF2Utils and moved BCF2Constants and TypeDescriptor methods there --- .../sting/utils/codecs/bcf2/BCF2Codec.java | 20 +++---- .../utils/codecs/bcf2/BCF2Constants.java | 42 -------------- .../sting/utils/codecs/bcf2/BCF2Decoder.java | 8 +-- .../sting/utils/codecs/bcf2/BCF2Encoder.java | 6 +- .../sting/utils/codecs/bcf2/BCF2Type.java | 8 +-- .../{TypeDescriptor.java => BCF2Utils.java} | 58 ++++++++++++++++--- .../utils/codecs/bcf2/writer/BCF2Writer.java | 25 +++----- .../codecs/bcf2/EncoderDecoderUnitTest.java | 2 +- 8 files changed, 77 insertions(+), 92 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Constants.java rename public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/{TypeDescriptor.java => BCF2Utils.java} (53%) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 621637c29..b6d9e8d90 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -47,7 +47,7 @@ public class BCF2Codec implements FeatureCodec { final protected static Logger logger = Logger.getLogger(BCF2Codec.class); private VCFHeader header = null; private final ArrayList contigNames = new ArrayList(); - private final ArrayList dictionary = new ArrayList(); + private ArrayList dictionary; private final BCF2Decoder decoder = new BCF2Decoder(); private boolean skipGenotypes = false; @@ -126,7 +126,7 @@ public class BCF2Codec implements FeatureCodec { contigNames.add(contig.getID()); // create the string dictionary - parseDictionary(header); + dictionary = parseDictionary(header); // 
position right before next line (would be right before first real record byte at end of header) return new FeatureCodecHeader(header, inputStream.getPosition()); @@ -138,7 +138,7 @@ public class BCF2Codec implements FeatureCodec { FileInputStream fis = new FileInputStream(path); AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis)); String firstLine = reader.readLine(); - if ( firstLine != null && firstLine.equals(BCF2Constants.VERSION_LINE) ) { + if ( firstLine != null && firstLine.equals(BCF2Utils.VERSION_LINE) ) { return true; } } catch ( FileNotFoundException e ) { @@ -150,18 +150,14 @@ public class BCF2Codec implements FeatureCodec { return false; } - private final void parseDictionary(final VCFHeader header) { - for ( final VCFHeaderLine line : header.getMetaData() ) { - if ( line.getKey().equals(BCF2Constants.DICTIONARY_LINE_TAG) ) { - for ( final String string : line.getValue().split(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR) ) - dictionary.add(string); - break; - } - } + private final ArrayList parseDictionary(final VCFHeader header) { + final ArrayList dict = BCF2Utils.makeDictionary(header); // if we got here we never found a dictionary, or there are no elements in the dictionary - if ( dictionary.size() == 0 ) + if ( dict.size() == 0 ) throw new UserException.MalformedBCF2("Dictionary header element was absent or empty"); + + return dict; } public boolean isSkippingGenotypes() { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Constants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Constants.java deleted file mode 100644 index 5936db9fd..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Constants.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to 
deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.bcf2; - -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; - -import java.nio.charset.Charset; - -public class BCF2Constants { - public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d"; - public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1); - public static final String DICTIONARY_LINE_TAG = "dictionary"; - public static final String DICTIONARY_LINE_ENTRY_SEPARATOR = ","; - - // Note that these values are prefixed by FFFFFF for convenience - public static final int INT8_MISSING_VALUE = 0xFFFFFF80; - public static final int INT16_MISSING_VALUE = 0xFFFF8000; - public static final int INT32_MISSING_VALUE = 0x80000000; - public static final int FLOAT_MISSING_VALUE = 0x7F800001; -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index 2adeb71d8..5d6ebf113 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -129,8 +129,8 @@ public class BCF2Decoder { } public final Object decodeTypedValue(final byte typeDescriptor) { - final int size = TypeDescriptor.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : TypeDescriptor.decodeSize(typeDescriptor); - final BCF2Type type = TypeDescriptor.decodeType(typeDescriptor); + final int size = BCF2Utils.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : BCF2Utils.decodeSize(typeDescriptor); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); assert size >= 0; @@ -186,8 +186,8 @@ public class BCF2Decoder { private final int decodeVectorSize() { final byte typeDescriptor = readTypeDescriptor(); - final int size = TypeDescriptor.decodeSize(typeDescriptor); - final BCF2Type type = TypeDescriptor.decodeType(typeDescriptor); + final int size = BCF2Utils.decodeSize(typeDescriptor); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); assert size == 1; assert type == BCF2Type.INT8 || type == BCF2Type.INT16 || type == BCF2Type.INT32; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java index 090ae9144..dc3e17444 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java @@ -171,9 +171,9 @@ public class BCF2Encoder { } public final void encodeType(final int size, final BCF2Type type) throws IOException { - final byte typeByte = TypeDescriptor.encodeTypeDescriptor(size, type); + final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type); encodeStream.write(typeByte); - if ( TypeDescriptor.willOverflow(size) ) + if ( BCF2Utils.willOverflow(size) ) encodeTypedIntOfBestSize(size); } @@ -206,7 +206,7 @@ public class BCF2Encoder { } 
public final BCF2Type determineIntegerType(final int value) { - for ( final BCF2Type potentialType : TypeDescriptor.INTEGER_TYPES_BY_SIZE ) { + for ( final BCF2Type potentialType : BCF2Utils.INTEGER_TYPES_BY_SIZE ) { if ( potentialType.withinRange(value) ) return potentialType; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java index 3acfd8468..37bdda6a4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java @@ -32,11 +32,11 @@ package org.broadinstitute.sting.utils.codecs.bcf2; */ public enum BCF2Type { RESERVED_0, - INT8(1, BCF2Constants.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range - INT16(2, BCF2Constants.INT16_MISSING_VALUE, -32767, 32767), - INT32(4, BCF2Constants.INT32_MISSING_VALUE, -2147483647, 2147483647), + INT8(1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range + INT16(2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767), + INT32(4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647), RESERVED_4, - FLOAT(4, BCF2Constants.FLOAT_MISSING_VALUE), + FLOAT(4, BCF2Utils.FLOAT_MISSING_VALUE), RESERVED_6, CHAR; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/TypeDescriptor.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java similarity index 53% rename from public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/TypeDescriptor.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index ebd100811..3d6fe056d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/TypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -24,23 +24,63 @@ package org.broadinstitute.sting.utils.codecs.bcf2; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import 
org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine; + +import java.util.ArrayList; + /** - * Convenience methods for encoding, decoding BCF2 type descriptors (size + type) - * @author Mark DePristo - * @since 5/3/12 + * Common utilities for working with BCF2 files + * + * Includes convenience methods for encoding, decoding BCF2 type descriptors (size + type) + * + * @author depristo + * @since 5/12 */ -class TypeDescriptor { +public class BCF2Utils { public static final int OVERFLOW_ELEMENT_MARKER = 15; public static final int MAX_INLINE_ELEMENTS = 14; - public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[3]; - public final static BCF2Type[] DICTIONARY_TYPES_BY_SIZE = INTEGER_TYPES_BY_SIZE; private final static BCF2Type[] LOOKUP = BCF2Type.values(); + public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d"; + public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1); + + // Note that these values are prefixed by FFFFFF for convenience + public static final int INT8_MISSING_VALUE = 0xFFFFFF80; + public static final int INT16_MISSING_VALUE = 0xFFFF8000; + public static final int INT32_MISSING_VALUE = 0x80000000; + public static final int FLOAT_MISSING_VALUE = 0x7F800001; static { - INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8; - INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16; - INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32; + BCF2Utils.INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8; + BCF2Utils.INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16; + BCF2Utils.INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32; + } + + /** + * Create a strings dictionary from the VCF header + * + * The dictionary is an ordered list of common VCF identifiers (FILTER, INFO, and FORMAT) + * fields. 
+ * + * @param header the VCFHeader from which to build the dictionary + * @return a non-null dictionary of elements, may be empty + */ + public final static ArrayList makeDictionary(final VCFHeader header) { + final ArrayList dict = new ArrayList(); + + // set up the strings dictionary + dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field + for ( VCFHeaderLine line : header.getMetaData() ) { + if ( line instanceof VCFIDHeaderLine) { + VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; + dict.add(idLine.getID()); + } + } + + return dict; } public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/writer/BCF2Writer.java index 81bb2ce8e..822f62407 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/writer/BCF2Writer.java @@ -26,10 +26,9 @@ package org.broadinstitute.sting.utils.codecs.bcf2.writer; import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Constants; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; +import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.codecs.vcf.writer.IndexingVCFWriter; import org.broadinstitute.sting.utils.codecs.vcf.writer.StandardVCFWriter; @@ -52,7 +51,7 @@ public class BCF2Writer extends IndexingVCFWriter { private OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support private VCFHeader header; private Map contigDictionary = new HashMap(); - private Map 
stringDictionary = new LinkedHashMap(); + private Map stringDictionaryMap = new LinkedHashMap(); private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives @@ -75,22 +74,14 @@ public class BCF2Writer extends IndexingVCFWriter { for ( final VCFContigHeaderLine contig : header.getContigLines()) contigDictionary.put(contig.getID(), contig.getContigIndex()); - // set up the strings dictionary - int offset = 0; - stringDictionary.put(VCFConstants.PASSES_FILTERS_v4, offset++); // special case the special PASS field - for ( VCFHeaderLine line : header.getMetaData() ) { - if ( line instanceof VCFIDHeaderLine ) { - VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; - stringDictionary.put(idLine.getID(), offset++); - } + // set up the map from dictionary string values -> offset + final ArrayList dict = BCF2Utils.makeDictionary(header); + for ( int i = 0; i < dict.size(); i++ ) { + stringDictionaryMap.put(dict.get(i), i); } - // add the dictionary ##dictionary=x,y,z line to the header - final String dictionaryLineValue = Utils.join(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR, stringDictionary.keySet()); - header.addMetaDataLine(new VCFHeaderLine(BCF2Constants.DICTIONARY_LINE_TAG, dictionaryLineValue)); - // write out the header - StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Constants.VERSION_LINE, "BCF2 stream"); + StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Utils.VERSION_LINE, "BCF2 stream"); } @Override @@ -369,7 +360,7 @@ public class BCF2Writer extends IndexingVCFWriter { // iterate over strings until we find one that needs 16 bits, and break for ( final String string : strings ) { - final int offset = stringDictionary.get(string); + final int offset = stringDictionaryMap.get(string); offsets.add(offset); final BCF2Type type1 = encoder.determineIntegerType(offset); switch ( type1 ) { diff --git 
a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java index bdaf89cb3..b9540da32 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/EncoderDecoderUnitTest.java @@ -234,7 +234,7 @@ public class EncoderDecoderUnitTest extends BaseTest { for ( final BCF2TypedValue tv : toEncode ) { if ( tv.type != BCF2Type.CHAR ) { for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { - final byte td = TypeDescriptor.encodeTypeDescriptor(1, tv.type); + final byte td = BCF2Utils.encodeTypeDescriptor(1, tv.type); final BCF2Encoder encoder = new BCF2Encoder(); for ( int i = 0; i < length; i++ ) {