The BCF2 header no longer carries an explicit dictionary line; the string dictionary is now built dynamically from the header in both the writer and the codec

-- Created BCF2Utils and moved BCF2Constants and TypeDescriptor methods there
This commit is contained in:
Mark DePristo 2012-05-12 13:07:08 -04:00
parent f0b081a85f
commit ce9e9eebb1
8 changed files with 77 additions and 92 deletions

View File

@ -47,7 +47,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
private VCFHeader header = null;
private final ArrayList<String> contigNames = new ArrayList<String>();
private final ArrayList<String> dictionary = new ArrayList<String>();
private ArrayList<String> dictionary;
private final BCF2Decoder decoder = new BCF2Decoder();
private boolean skipGenotypes = false;
@ -126,7 +126,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
contigNames.add(contig.getID());
// create the string dictionary
parseDictionary(header);
dictionary = parseDictionary(header);
// position right before next line (would be right before first real record byte at end of header)
return new FeatureCodecHeader(header, inputStream.getPosition());
@ -138,7 +138,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
FileInputStream fis = new FileInputStream(path);
AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis));
String firstLine = reader.readLine();
if ( firstLine != null && firstLine.equals(BCF2Constants.VERSION_LINE) ) {
if ( firstLine != null && firstLine.equals(BCF2Utils.VERSION_LINE) ) {
return true;
}
} catch ( FileNotFoundException e ) {
@ -150,18 +150,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
return false;
}
private final void parseDictionary(final VCFHeader header) {
for ( final VCFHeaderLine line : header.getMetaData() ) {
if ( line.getKey().equals(BCF2Constants.DICTIONARY_LINE_TAG) ) {
for ( final String string : line.getValue().split(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR) )
dictionary.add(string);
break;
}
}
private final ArrayList<String> parseDictionary(final VCFHeader header) {
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
// if we got here we never found a dictionary, or there are no elements in the dictionary
if ( dictionary.size() == 0 )
if ( dict.size() == 0 )
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
return dict;
}
public boolean isSkippingGenotypes() {

View File

@ -1,42 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import java.nio.charset.Charset;
/**
 * Constants shared by the BCF2 reader and writer: the file-format version line,
 * the tag and separator of the dictionary header line, and the sentinel values
 * that stand for "missing" in each primitive BCF2 type.
 */
public class BCF2Constants {
    /** Version numbers substituted into {@link #VERSION_LINE_FORMAT} to build {@link #VERSION_LINE}. */
    private static final int MAJOR_VERSION = 0;
    private static final int MINOR_VERSION = 1;

    /** Template of the file-format line; takes the major and minor version as its two %d arguments. */
    public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d";

    /** The complete version line: the VCF metadata indicator followed by the formatted version string. */
    public static final String VERSION_LINE =
            String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, MAJOR_VERSION, MINOR_VERSION);

    /** Key of the header line that lists the dictionary entries. */
    public static final String DICTIONARY_LINE_TAG = "dictionary";

    /** Separator between the individual entries of the dictionary header line. */
    public static final String DICTIONARY_LINE_ENTRY_SEPARATOR = ",";

    // The 8- and 16-bit sentinels are stored sign-extended to a full int, which is
    // why their hexadecimal forms carry leading F digits.

    /** Missing value for INT8: 0xFFFFFF80, i.e. the byte pattern 0x80 sign-extended. */
    public static final int INT8_MISSING_VALUE = Byte.MIN_VALUE;

    /** Missing value for INT16: 0xFFFF8000, i.e. the short pattern 0x8000 sign-extended. */
    public static final int INT16_MISSING_VALUE = Short.MIN_VALUE;

    /** Missing value for INT32: 0x80000000. */
    public static final int INT32_MISSING_VALUE = Integer.MIN_VALUE;

    /**
     * Missing value for FLOAT, held as a raw int bit pattern.
     * NOTE(review): presumably converted to/from float by bit reinterpretation
     * in the encoder and decoder -- confirm against those classes.
     */
    public static final int FLOAT_MISSING_VALUE = 0x7F800001;
}

View File

@ -129,8 +129,8 @@ public class BCF2Decoder {
}
public final Object decodeTypedValue(final byte typeDescriptor) {
final int size = TypeDescriptor.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : TypeDescriptor.decodeSize(typeDescriptor);
final BCF2Type type = TypeDescriptor.decodeType(typeDescriptor);
final int size = BCF2Utils.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : BCF2Utils.decodeSize(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size >= 0;
@ -186,8 +186,8 @@ public class BCF2Decoder {
private final int decodeVectorSize() {
final byte typeDescriptor = readTypeDescriptor();
final int size = TypeDescriptor.decodeSize(typeDescriptor);
final BCF2Type type = TypeDescriptor.decodeType(typeDescriptor);
final int size = BCF2Utils.decodeSize(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size == 1;
assert type == BCF2Type.INT8 || type == BCF2Type.INT16 || type == BCF2Type.INT32;

View File

@ -171,9 +171,9 @@ public class BCF2Encoder {
}
public final void encodeType(final int size, final BCF2Type type) throws IOException {
final byte typeByte = TypeDescriptor.encodeTypeDescriptor(size, type);
final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
encodeStream.write(typeByte);
if ( TypeDescriptor.willOverflow(size) )
if ( BCF2Utils.willOverflow(size) )
encodeTypedIntOfBestSize(size);
}
@ -206,7 +206,7 @@ public class BCF2Encoder {
}
public final BCF2Type determineIntegerType(final int value) {
for ( final BCF2Type potentialType : TypeDescriptor.INTEGER_TYPES_BY_SIZE ) {
for ( final BCF2Type potentialType : BCF2Utils.INTEGER_TYPES_BY_SIZE ) {
if ( potentialType.withinRange(value) )
return potentialType;
}

View File

@ -32,11 +32,11 @@ package org.broadinstitute.sting.utils.codecs.bcf2;
*/
public enum BCF2Type {
RESERVED_0,
INT8(1, BCF2Constants.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
INT16(2, BCF2Constants.INT16_MISSING_VALUE, -32767, 32767),
INT32(4, BCF2Constants.INT32_MISSING_VALUE, -2147483647, 2147483647),
INT8(1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
INT16(2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767),
INT32(4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647),
RESERVED_4,
FLOAT(4, BCF2Constants.FLOAT_MISSING_VALUE),
FLOAT(4, BCF2Utils.FLOAT_MISSING_VALUE),
RESERVED_6,
CHAR;

View File

@ -24,23 +24,63 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine;
import java.util.ArrayList;
/**
* Convenience methods for encoding, decoding BCF2 type descriptors (size + type)
* @author Mark DePristo
* @since 5/3/12
* Common utilities for working with BCF2 files
*
* Includes convenience methods for encoding, decoding BCF2 type descriptors (size + type)
*
* @author depristo
* @since 5/12
*/
class TypeDescriptor {
public class BCF2Utils {
public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14;
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[3];
public final static BCF2Type[] DICTIONARY_TYPES_BY_SIZE = INTEGER_TYPES_BY_SIZE;
private final static BCF2Type[] LOOKUP = BCF2Type.values();
public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d";
public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1);
// Note that the INT8 and INT16 values are sign-extended to 32 bits (hence the leading FFFFFF / FFFF) for convenience
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
public static final int INT32_MISSING_VALUE = 0x80000000;
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
static {
INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8;
INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16;
INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32;
BCF2Utils.INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8;
BCF2Utils.INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16;
BCF2Utils.INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32;
}
/**
 * Build the strings dictionary for a VCF header.
 *
 * The dictionary is an ordered list of identifiers.  Its first element is always
 * VCFConstants.PASSES_FILTERS_v4; the remaining elements are the IDs of every
 * VCFIDHeaderLine found in the header's metadata (presumably the FILTER, INFO and
 * FORMAT definitions), in the order in which header.getMetaData() yields them.
 * IDs are appended as-is, so an ID carried by more than one header line appears
 * more than once.
 *
 * @param header the VCFHeader from which to build the dictionary
 * @return a newly allocated, non-null list that always holds at least the PASS entry
 */
public final static ArrayList<String> makeDictionary(final VCFHeader header) {
    final ArrayList<String> strings = new ArrayList<String>();

    // the special PASS filter value is not read from the header lines; it always takes the first slot
    strings.add(VCFConstants.PASSES_FILTERS_v4);

    for ( final VCFHeaderLine headerLine : header.getMetaData() ) {
        // only header lines that carry an ID contribute to the dictionary
        if ( ! (headerLine instanceof VCFIDHeaderLine) )
            continue;

        strings.add(((VCFIDHeaderLine)headerLine).getID());
    }

    return strings;
}
public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {

View File

@ -26,10 +26,9 @@ package org.broadinstitute.sting.utils.codecs.bcf2.writer;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Constants;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.codecs.vcf.writer.IndexingVCFWriter;
import org.broadinstitute.sting.utils.codecs.vcf.writer.StandardVCFWriter;
@ -52,7 +51,7 @@ public class BCF2Writer extends IndexingVCFWriter {
private OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support
private VCFHeader header;
private Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
private Map<String, Integer> stringDictionary = new LinkedHashMap<String, Integer>();
private Map<String, Integer> stringDictionaryMap = new LinkedHashMap<String, Integer>();
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
@ -75,22 +74,14 @@ public class BCF2Writer extends IndexingVCFWriter {
for ( final VCFContigHeaderLine contig : header.getContigLines())
contigDictionary.put(contig.getID(), contig.getContigIndex());
// set up the strings dictionary
int offset = 0;
stringDictionary.put(VCFConstants.PASSES_FILTERS_v4, offset++); // special case the special PASS field
for ( VCFHeaderLine line : header.getMetaData() ) {
if ( line instanceof VCFIDHeaderLine ) {
VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
stringDictionary.put(idLine.getID(), offset++);
}
// set up the map from dictionary string values -> offset
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
for ( int i = 0; i < dict.size(); i++ ) {
stringDictionaryMap.put(dict.get(i), i);
}
// add the dictionary ##dictionary=x,y,z line to the header
final String dictionaryLineValue = Utils.join(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR, stringDictionary.keySet());
header.addMetaDataLine(new VCFHeaderLine(BCF2Constants.DICTIONARY_LINE_TAG, dictionaryLineValue));
// write out the header
StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Constants.VERSION_LINE, "BCF2 stream");
StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Utils.VERSION_LINE, "BCF2 stream");
}
@Override
@ -369,7 +360,7 @@ public class BCF2Writer extends IndexingVCFWriter {
// iterate over strings until we find one that needs 16 bits, and break
for ( final String string : strings ) {
final int offset = stringDictionary.get(string);
final int offset = stringDictionaryMap.get(string);
offsets.add(offset);
final BCF2Type type1 = encoder.determineIntegerType(offset);
switch ( type1 ) {

View File

@ -234,7 +234,7 @@ public class EncoderDecoderUnitTest extends BaseTest {
for ( final BCF2TypedValue tv : toEncode ) {
if ( tv.type != BCF2Type.CHAR ) {
for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) {
final byte td = TypeDescriptor.encodeTypeDescriptor(1, tv.type);
final byte td = BCF2Utils.encodeTypeDescriptor(1, tv.type);
final BCF2Encoder encoder = new BCF2Encoder();
for ( int i = 0; i < length; i++ ) {