No dictionary line in the header; the dictionary is now built dynamically from the header in both the writer and the codec
-- Created BCF2Utils and moved BCF2Constants and TypeDescriptor methods there
This commit is contained in:
parent
f0b081a85f
commit
ce9e9eebb1
|
|
@ -47,7 +47,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
|
||||
private VCFHeader header = null;
|
||||
private final ArrayList<String> contigNames = new ArrayList<String>();
|
||||
private final ArrayList<String> dictionary = new ArrayList<String>();
|
||||
private ArrayList<String> dictionary;
|
||||
private final BCF2Decoder decoder = new BCF2Decoder();
|
||||
private boolean skipGenotypes = false;
|
||||
|
||||
|
|
@ -126,7 +126,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
contigNames.add(contig.getID());
|
||||
|
||||
// create the string dictionary
|
||||
parseDictionary(header);
|
||||
dictionary = parseDictionary(header);
|
||||
|
||||
// position right before next line (would be right before first real record byte at end of header)
|
||||
return new FeatureCodecHeader(header, inputStream.getPosition());
|
||||
|
|
@ -138,7 +138,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
FileInputStream fis = new FileInputStream(path);
|
||||
AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis));
|
||||
String firstLine = reader.readLine();
|
||||
if ( firstLine != null && firstLine.equals(BCF2Constants.VERSION_LINE) ) {
|
||||
if ( firstLine != null && firstLine.equals(BCF2Utils.VERSION_LINE) ) {
|
||||
return true;
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
|
|
@ -150,18 +150,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
return false;
|
||||
}
|
||||
|
||||
private final void parseDictionary(final VCFHeader header) {
|
||||
for ( final VCFHeaderLine line : header.getMetaData() ) {
|
||||
if ( line.getKey().equals(BCF2Constants.DICTIONARY_LINE_TAG) ) {
|
||||
for ( final String string : line.getValue().split(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR) )
|
||||
dictionary.add(string);
|
||||
break;
|
||||
}
|
||||
}
|
||||
private final ArrayList<String> parseDictionary(final VCFHeader header) {
|
||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||
|
||||
// if we got here we never found a dictionary, or there are no elements in the dictionary
|
||||
if ( dictionary.size() == 0 )
|
||||
if ( dict.size() == 0 )
|
||||
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
|
||||
|
||||
return dict;
|
||||
}
|
||||
|
||||
public boolean isSkippingGenotypes() {
|
||||
|
|
|
|||
|
|
@ -1,42 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
public class BCF2Constants {
|
||||
public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d";
|
||||
public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1);
|
||||
public static final String DICTIONARY_LINE_TAG = "dictionary";
|
||||
public static final String DICTIONARY_LINE_ENTRY_SEPARATOR = ",";
|
||||
|
||||
// Note that these values are prefixed by FFFFFF for convenience
|
||||
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
|
||||
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
|
||||
public static final int INT32_MISSING_VALUE = 0x80000000;
|
||||
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
|
||||
}
|
||||
|
|
@ -129,8 +129,8 @@ public class BCF2Decoder {
|
|||
}
|
||||
|
||||
public final Object decodeTypedValue(final byte typeDescriptor) {
|
||||
final int size = TypeDescriptor.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : TypeDescriptor.decodeSize(typeDescriptor);
|
||||
final BCF2Type type = TypeDescriptor.decodeType(typeDescriptor);
|
||||
final int size = BCF2Utils.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : BCF2Utils.decodeSize(typeDescriptor);
|
||||
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
|
||||
|
||||
assert size >= 0;
|
||||
|
||||
|
|
@ -186,8 +186,8 @@ public class BCF2Decoder {
|
|||
|
||||
private final int decodeVectorSize() {
|
||||
final byte typeDescriptor = readTypeDescriptor();
|
||||
final int size = TypeDescriptor.decodeSize(typeDescriptor);
|
||||
final BCF2Type type = TypeDescriptor.decodeType(typeDescriptor);
|
||||
final int size = BCF2Utils.decodeSize(typeDescriptor);
|
||||
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
|
||||
|
||||
assert size == 1;
|
||||
assert type == BCF2Type.INT8 || type == BCF2Type.INT16 || type == BCF2Type.INT32;
|
||||
|
|
|
|||
|
|
@ -171,9 +171,9 @@ public class BCF2Encoder {
|
|||
}
|
||||
|
||||
public final void encodeType(final int size, final BCF2Type type) throws IOException {
|
||||
final byte typeByte = TypeDescriptor.encodeTypeDescriptor(size, type);
|
||||
final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
|
||||
encodeStream.write(typeByte);
|
||||
if ( TypeDescriptor.willOverflow(size) )
|
||||
if ( BCF2Utils.willOverflow(size) )
|
||||
encodeTypedIntOfBestSize(size);
|
||||
}
|
||||
|
||||
|
|
@ -206,7 +206,7 @@ public class BCF2Encoder {
|
|||
}
|
||||
|
||||
public final BCF2Type determineIntegerType(final int value) {
|
||||
for ( final BCF2Type potentialType : TypeDescriptor.INTEGER_TYPES_BY_SIZE ) {
|
||||
for ( final BCF2Type potentialType : BCF2Utils.INTEGER_TYPES_BY_SIZE ) {
|
||||
if ( potentialType.withinRange(value) )
|
||||
return potentialType;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,11 +32,11 @@ package org.broadinstitute.sting.utils.codecs.bcf2;
|
|||
*/
|
||||
public enum BCF2Type {
|
||||
RESERVED_0,
|
||||
INT8(1, BCF2Constants.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
|
||||
INT16(2, BCF2Constants.INT16_MISSING_VALUE, -32767, 32767),
|
||||
INT32(4, BCF2Constants.INT32_MISSING_VALUE, -2147483647, 2147483647),
|
||||
INT8(1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
|
||||
INT16(2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767),
|
||||
INT32(4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647),
|
||||
RESERVED_4,
|
||||
FLOAT(4, BCF2Constants.FLOAT_MISSING_VALUE),
|
||||
FLOAT(4, BCF2Utils.FLOAT_MISSING_VALUE),
|
||||
RESERVED_6,
|
||||
CHAR;
|
||||
|
||||
|
|
|
|||
|
|
@ -24,23 +24,63 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Convenience methods for encoding, decoding BCF2 type descriptors (size + type)
|
||||
* @author Mark DePristo
|
||||
* @since 5/3/12
|
||||
* Common utilities for working with BCF2 files
|
||||
*
|
||||
* Includes convenience methods for encoding, decoding BCF2 type descriptors (size + type)
|
||||
*
|
||||
* @author depristo
|
||||
* @since 5/12
|
||||
*/
|
||||
class TypeDescriptor {
|
||||
public class BCF2Utils {
|
||||
public static final int OVERFLOW_ELEMENT_MARKER = 15;
|
||||
public static final int MAX_INLINE_ELEMENTS = 14;
|
||||
|
||||
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[3];
|
||||
public final static BCF2Type[] DICTIONARY_TYPES_BY_SIZE = INTEGER_TYPES_BY_SIZE;
|
||||
private final static BCF2Type[] LOOKUP = BCF2Type.values();
|
||||
public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d";
|
||||
public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1);
|
||||
|
||||
// Note that these values are prefixed by FFFFFF for convenience
|
||||
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
|
||||
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
|
||||
public static final int INT32_MISSING_VALUE = 0x80000000;
|
||||
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
|
||||
|
||||
static {
|
||||
INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8;
|
||||
INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16;
|
||||
INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32;
|
||||
BCF2Utils.INTEGER_TYPES_BY_SIZE[0] = BCF2Type.INT8;
|
||||
BCF2Utils.INTEGER_TYPES_BY_SIZE[1] = BCF2Type.INT16;
|
||||
BCF2Utils.INTEGER_TYPES_BY_SIZE[2] = BCF2Type.INT32;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a strings dictionary from the VCF header
|
||||
*
|
||||
* The dictionary is an ordered list of common VCF identifiers (FILTER, INFO, and FORMAT)
|
||||
* fields.
|
||||
*
|
||||
* @param header the VCFHeader from which to build the dictionary
|
||||
* @return a non-null dictionary of elements, may be empty
|
||||
*/
|
||||
public final static ArrayList<String> makeDictionary(final VCFHeader header) {
|
||||
final ArrayList<String> dict = new ArrayList<String>();
|
||||
|
||||
// set up the strings dictionary
|
||||
dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field
|
||||
for ( VCFHeaderLine line : header.getMetaData() ) {
|
||||
if ( line instanceof VCFIDHeaderLine) {
|
||||
VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
|
||||
dict.add(idLine.getID());
|
||||
}
|
||||
}
|
||||
|
||||
return dict;
|
||||
}
|
||||
|
||||
public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
|
||||
|
|
@ -26,10 +26,9 @@ package org.broadinstitute.sting.utils.codecs.bcf2.writer;
|
|||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Constants;
|
||||
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder;
|
||||
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
|
||||
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.writer.IndexingVCFWriter;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.writer.StandardVCFWriter;
|
||||
|
|
@ -52,7 +51,7 @@ public class BCF2Writer extends IndexingVCFWriter {
|
|||
private OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support
|
||||
private VCFHeader header;
|
||||
private Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
|
||||
private Map<String, Integer> stringDictionary = new LinkedHashMap<String, Integer>();
|
||||
private Map<String, Integer> stringDictionaryMap = new LinkedHashMap<String, Integer>();
|
||||
|
||||
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
|
||||
|
||||
|
|
@ -75,22 +74,14 @@ public class BCF2Writer extends IndexingVCFWriter {
|
|||
for ( final VCFContigHeaderLine contig : header.getContigLines())
|
||||
contigDictionary.put(contig.getID(), contig.getContigIndex());
|
||||
|
||||
// set up the strings dictionary
|
||||
int offset = 0;
|
||||
stringDictionary.put(VCFConstants.PASSES_FILTERS_v4, offset++); // special case the special PASS field
|
||||
for ( VCFHeaderLine line : header.getMetaData() ) {
|
||||
if ( line instanceof VCFIDHeaderLine ) {
|
||||
VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
|
||||
stringDictionary.put(idLine.getID(), offset++);
|
||||
}
|
||||
// set up the map from dictionary string values -> offset
|
||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||
for ( int i = 0; i < dict.size(); i++ ) {
|
||||
stringDictionaryMap.put(dict.get(i), i);
|
||||
}
|
||||
|
||||
// add the dictionary ##dictionary=x,y,z line to the header
|
||||
final String dictionaryLineValue = Utils.join(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR, stringDictionary.keySet());
|
||||
header.addMetaDataLine(new VCFHeaderLine(BCF2Constants.DICTIONARY_LINE_TAG, dictionaryLineValue));
|
||||
|
||||
// write out the header
|
||||
StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Constants.VERSION_LINE, "BCF2 stream");
|
||||
StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Utils.VERSION_LINE, "BCF2 stream");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -369,7 +360,7 @@ public class BCF2Writer extends IndexingVCFWriter {
|
|||
|
||||
// iterate over strings until we find one that needs 16 bits, and break
|
||||
for ( final String string : strings ) {
|
||||
final int offset = stringDictionary.get(string);
|
||||
final int offset = stringDictionaryMap.get(string);
|
||||
offsets.add(offset);
|
||||
final BCF2Type type1 = encoder.determineIntegerType(offset);
|
||||
switch ( type1 ) {
|
||||
|
|
|
|||
|
|
@ -234,7 +234,7 @@ public class EncoderDecoderUnitTest extends BaseTest {
|
|||
for ( final BCF2TypedValue tv : toEncode ) {
|
||||
if ( tv.type != BCF2Type.CHAR ) {
|
||||
for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) {
|
||||
final byte td = TypeDescriptor.encodeTypeDescriptor(1, tv.type);
|
||||
final byte td = BCF2Utils.encodeTypeDescriptor(1, tv.type);
|
||||
|
||||
final BCF2Encoder encoder = new BCF2Encoder();
|
||||
for ( int i = 0; i < length; i++ ) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue