Move BCF2 from private utils to public codecs
This commit is contained in:
parent
d13cda6b6f
commit
679ffdd333
|
|
@ -0,0 +1,414 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broad.tribble.Feature;
|
||||||
|
import org.broad.tribble.FeatureCodec;
|
||||||
|
import org.broad.tribble.FeatureCodecHeader;
|
||||||
|
import org.broad.tribble.readers.AsciiLineReader;
|
||||||
|
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class BCF2Codec implements FeatureCodec<VariantContext> {
|
||||||
|
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
|
||||||
|
private VCFHeader header = null;
|
||||||
|
private final ArrayList<String> contigNames = new ArrayList<String>();
|
||||||
|
private final ArrayList<String> dictionary = new ArrayList<String>();
|
||||||
|
private final BCF2Decoder decoder = new BCF2Decoder();
|
||||||
|
private boolean skipGenotypes = false;
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Feature codec interface functions
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Feature decodeLoc( final PositionalBufferedStream inputStream ) {
|
||||||
|
return decode(inputStream);
|
||||||
|
// TODO: a less expensive version of decodeLoc() that doesn't use VariantContext
|
||||||
|
// TODO: very easy -- just decodeSitesBlock, and then skip to end of end of sites block
|
||||||
|
// TODO: and then skip genotypes block
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public VariantContext decode( final PositionalBufferedStream inputStream ) {
|
||||||
|
final VariantContextBuilder builder = new VariantContextBuilder();
|
||||||
|
|
||||||
|
final int sitesBlockSize = decoder.readBlockSize(inputStream);
|
||||||
|
final int genotypeBlockSize = decoder.readBlockSize(inputStream);
|
||||||
|
decoder.readNextBlock(sitesBlockSize, inputStream);
|
||||||
|
final SitesInfoForDecoding info = decodeSitesBlock(builder);
|
||||||
|
|
||||||
|
if ( isSkippingGenotypes() ) {
|
||||||
|
decoder.skipNextBlock(genotypeBlockSize, inputStream);
|
||||||
|
} else {
|
||||||
|
decoder.readNextBlock(genotypeBlockSize, inputStream);
|
||||||
|
decodeGenotypes(info, builder);
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder.make();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Class<VariantContext> getFeatureType() {
|
||||||
|
return VariantContext.class;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) {
|
||||||
|
AsciiLineReader headerReader = new AsciiLineReader(inputStream);
|
||||||
|
String headerLine;
|
||||||
|
List<String> headerLines = new ArrayList<String>();
|
||||||
|
boolean foundHeaderEnd = false;
|
||||||
|
|
||||||
|
try {
|
||||||
|
while ( ! foundHeaderEnd && (headerLine = headerReader.readLine()) != null) {
|
||||||
|
if ( headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) {
|
||||||
|
headerLines.add(headerLine);
|
||||||
|
}
|
||||||
|
else if ( headerLine.startsWith(VCFHeader.HEADER_INDICATOR) ) {
|
||||||
|
headerLines.add(headerLine);
|
||||||
|
foundHeaderEnd = true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch ( IOException e ) {
|
||||||
|
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 header");
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ! foundHeaderEnd ) {
|
||||||
|
throw new UserException.MalformedBCF2("Reached end of header without encountering a field layout line");
|
||||||
|
}
|
||||||
|
|
||||||
|
// read the header
|
||||||
|
this.header = AbstractVCFCodec.parseHeader(headerLines, VCFHeaderVersion.VCF4_1);
|
||||||
|
|
||||||
|
// create the config offsets
|
||||||
|
for ( final VCFContigHeaderLine contig : header.getContigLines())
|
||||||
|
contigNames.add(contig.getID());
|
||||||
|
|
||||||
|
// create the string dictionary
|
||||||
|
parseDictionary(header);
|
||||||
|
|
||||||
|
// position right before next line (would be right before first real record byte at end of header)
|
||||||
|
return new FeatureCodecHeader(header, inputStream.getPosition());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean canDecode( final String path ) {
|
||||||
|
try {
|
||||||
|
FileInputStream fis = new FileInputStream(path);
|
||||||
|
AsciiLineReader reader = new AsciiLineReader(new PositionalBufferedStream(fis));
|
||||||
|
String firstLine = reader.readLine();
|
||||||
|
if ( firstLine != null && firstLine.equals(BCF2Constants.VERSION_LINE) ) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
return false;
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void parseDictionary(final VCFHeader header) {
|
||||||
|
for ( final VCFHeaderLine line : header.getMetaData() ) {
|
||||||
|
if ( line.getKey().equals(BCF2Constants.DICTIONARY_LINE_TAG) ) {
|
||||||
|
for ( final String string : line.getValue().split(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR) )
|
||||||
|
dictionary.add(string);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we got here we never found a dictionary, or there are no elements in the dictionary
|
||||||
|
if ( dictionary.size() == 0 )
|
||||||
|
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isSkippingGenotypes() {
|
||||||
|
return skipGenotypes;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSkipGenotypes(final boolean skipGenotypes) {
|
||||||
|
this.skipGenotypes = skipGenotypes;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// implicit block
|
||||||
|
//
|
||||||
|
// The first four records of BCF are inline untyped encoded data of:
|
||||||
|
//
|
||||||
|
// 4 byte integer chrom offset
|
||||||
|
// 4 byte integer start
|
||||||
|
// 4 byte integer ref length
|
||||||
|
// 4 byte float qual
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) {
|
||||||
|
final int contigOffset = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
|
||||||
|
final String contig = lookupContigName(contigOffset);
|
||||||
|
builder.chr(contig);
|
||||||
|
|
||||||
|
final int pos = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
|
||||||
|
final int refLength = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
|
||||||
|
builder.start((long)pos);
|
||||||
|
builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals
|
||||||
|
|
||||||
|
final Object qual = decoder.decodeSingleValue(BCFType.FLOAT);
|
||||||
|
if ( qual != null ) {
|
||||||
|
builder.log10PError(((Double)qual) / -10.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
final int nAlleleInfo = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
|
||||||
|
final int nFormatSamples = decoder.decodeInt(BCFType.INT32.getSizeInBytes());
|
||||||
|
final int nAlleles = nAlleleInfo >> 16;
|
||||||
|
final int nInfo = nAlleleInfo & 0x00FF;
|
||||||
|
final int nFormatFields = nFormatSamples >> 24;
|
||||||
|
final int nSamples = nFormatSamples & 0x0FFF;
|
||||||
|
|
||||||
|
decodeID(builder);
|
||||||
|
final ArrayList<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
|
||||||
|
decodeFilter(builder);
|
||||||
|
decodeInfo(builder, nInfo);
|
||||||
|
|
||||||
|
return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static class SitesInfoForDecoding {
|
||||||
|
final int pos;
|
||||||
|
final int nFormatFields;
|
||||||
|
final int nSamples;
|
||||||
|
final ArrayList<Allele> alleles;
|
||||||
|
|
||||||
|
private SitesInfoForDecoding(final int pos, final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
|
||||||
|
this.pos = pos;
|
||||||
|
this.nFormatFields = nFormatFields;
|
||||||
|
this.nSamples = nSamples;
|
||||||
|
this.alleles = alleles;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void decodeID( final VariantContextBuilder builder ) {
|
||||||
|
final String id = (String)decoder.decodeTypedValue();
|
||||||
|
|
||||||
|
if ( id == null ) {
|
||||||
|
builder.noID();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
builder.id(id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
|
||||||
|
if ( AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
|
||||||
|
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
|
||||||
|
AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1);
|
||||||
|
return clippedAlleles;
|
||||||
|
} else
|
||||||
|
return unclippedAlleles;
|
||||||
|
}
|
||||||
|
|
||||||
|
private ArrayList<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) {
|
||||||
|
// TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
|
||||||
|
ArrayList<Allele> alleles = new ArrayList<Allele>(nAlleles);
|
||||||
|
String ref = null;
|
||||||
|
|
||||||
|
for ( int i = 0; i < nAlleles; i++ ) {
|
||||||
|
final String allele = (String)decoder.decodeTypedValue();
|
||||||
|
|
||||||
|
if ( i == 0 ) {
|
||||||
|
ref = allele;
|
||||||
|
alleles.add(Allele.create(allele, true));
|
||||||
|
} else {
|
||||||
|
alleles.add(Allele.create(allele, false));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
alleles = clipAllelesIfNecessary(pos, ref, alleles);
|
||||||
|
builder.alleles(alleles);
|
||||||
|
|
||||||
|
builder.referenceBaseForIndel(ref.getBytes()[0]);
|
||||||
|
|
||||||
|
return alleles;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void decodeFilter( final VariantContextBuilder builder ) {
|
||||||
|
final Object filters = decoder.decodeTypedValue();
|
||||||
|
|
||||||
|
if ( filters == null ) {
|
||||||
|
builder.unfiltered();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
builder.filters(new LinkedHashSet<String>(asStrings(filters)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) {
|
||||||
|
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
|
||||||
|
|
||||||
|
for ( int i = 0; i < numInfoFields; i++ ) {
|
||||||
|
final String key = getDictionaryString();
|
||||||
|
Object value = decoder.decodeTypedValue();
|
||||||
|
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, key);
|
||||||
|
if ( metaData.getType() == VCFHeaderLineType.Flag ) value = true; // special case for flags
|
||||||
|
infoFieldEntries.put(key, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.attributes(infoFieldEntries);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void decodeGenotypes( final SitesInfoForDecoding siteInfo, final VariantContextBuilder builder ) {
|
||||||
|
final List<String> samples = new ArrayList<String>(header.getGenotypeSamples());
|
||||||
|
final int nSamples = siteInfo.nSamples;
|
||||||
|
final int nFields = siteInfo.nFormatFields;
|
||||||
|
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(nFields, nSamples);
|
||||||
|
|
||||||
|
if ( samples.size() != nSamples )
|
||||||
|
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
|
||||||
|
"different numbers of samples per record. Saw " + samples.size() +
|
||||||
|
" samples in header but have a record with " + nSamples + " samples");
|
||||||
|
|
||||||
|
final List<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
|
||||||
|
for ( int i = 0; i < nSamples; i++ ) {
|
||||||
|
final String sampleName = samples.get(i);
|
||||||
|
List<Allele> alleles = null;
|
||||||
|
boolean isPhased = false;
|
||||||
|
double log10PError = VariantContext.NO_LOG10_PERROR;
|
||||||
|
Set<String> filters = null;
|
||||||
|
Map<String, Object> attributes = null;
|
||||||
|
double[] log10Likelihoods = null;
|
||||||
|
|
||||||
|
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
|
||||||
|
final String field = entry.getKey();
|
||||||
|
final List<Object> values = entry.getValue();
|
||||||
|
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
|
||||||
|
alleles = decodeGenotypeAlleles(siteInfo.alleles, (List<Integer>)values.get(i));
|
||||||
|
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
|
||||||
|
final Integer value = (Integer)values.get(i);
|
||||||
|
if ( value != BCFType.INT8.getMissingJavaValue() )
|
||||||
|
log10PError = value / -10.0;
|
||||||
|
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
|
||||||
|
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
|
||||||
|
//filters = new HashSet<String>(values.get(i));
|
||||||
|
} else { // add to attributes
|
||||||
|
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
|
||||||
|
attributes.put(field, values.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( alleles == null ) throw new ReviewedStingException("BUG: no alleles found");
|
||||||
|
|
||||||
|
final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
|
||||||
|
genotypes.add(g);
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.genotypes(genotypes);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
|
||||||
|
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
|
||||||
|
for ( final Integer encode : encoded ) {
|
||||||
|
if ( encode == null ) // absent, as are all following by definition
|
||||||
|
return gt;
|
||||||
|
else {
|
||||||
|
final int offset = encode >> 1;
|
||||||
|
if ( offset == 0 )
|
||||||
|
gt.add(Allele.NO_CALL);
|
||||||
|
else
|
||||||
|
gt.add(siteAlleles.get(offset - 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return gt;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) {
|
||||||
|
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
|
||||||
|
|
||||||
|
for ( int i = 0; i < nFields; i++ ) {
|
||||||
|
final String field = getDictionaryString();
|
||||||
|
final byte typeDescriptor = decoder.readTypeDescriptor();
|
||||||
|
final List<Object> values = new ArrayList<Object>(nSamples);
|
||||||
|
for ( int j = 0; j < nSamples; j++ )
|
||||||
|
values.add(decoder.decodeTypedValue(typeDescriptor));
|
||||||
|
map.put(field, values);
|
||||||
|
}
|
||||||
|
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final String getDictionaryString() {
|
||||||
|
final int offset = (Integer)decoder.decodeTypedValue();
|
||||||
|
final String field = dictionary.get(offset);
|
||||||
|
return field;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final String lookupContigName( final int contigOffset ) {
|
||||||
|
if ( contigOffset < contigNames.size() ) {
|
||||||
|
return contigNames.get(contigOffset);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Utility functions
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
private final Collection<String> asStrings(final Object o) {
|
||||||
|
return asCollection(String.class, o);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final <T> Collection<T> asCollection(final Class<T> c, final Object o) {
|
||||||
|
if ( o == null )
|
||||||
|
return Collections.emptyList();
|
||||||
|
else if ( o instanceof List ) {
|
||||||
|
return (List<T>)o;
|
||||||
|
} else {
|
||||||
|
return (Set<T>)Collections.singleton(o);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
|
public class BCF2Constants {
|
||||||
|
public static final String VERSION_LINE_FORMAT = "fileformat=BCF2v%d.%d";
|
||||||
|
public static final String VERSION_LINE = String.format(VCFHeader.METADATA_INDICATOR + VERSION_LINE_FORMAT, 0, 1);
|
||||||
|
public static final String DICTIONARY_LINE_TAG = "dictionary";
|
||||||
|
public static final String DICTIONARY_LINE_ENTRY_SEPARATOR = ",";
|
||||||
|
|
||||||
|
public static final Charset BCF2_TEXT_CHARSET = Charset.forName("US-ASCII"); // TODO: enforce this!
|
||||||
|
|
||||||
|
// Note that these values are prefixed by FFFFFF for convenience
|
||||||
|
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
|
||||||
|
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
|
||||||
|
public static final int INT32_MISSING_VALUE = 0x80000000;
|
||||||
|
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,277 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broad.tribble.FeatureCodec;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
public class BCF2Decoder {
|
||||||
|
// logger for this class; it must be named after BCF2Decoder itself so that its messages
// are attributed to, and configurable for, the decoder rather than the FeatureCodec interface
final protected static Logger logger = Logger.getLogger(BCF2Decoder.class);
|
||||||
|
|
||||||
|
byte[] recordBytes;
|
||||||
|
ByteArrayInputStream recordStream;
|
||||||
|
|
||||||
|
/**
 * Creates a decoder with no current block; load one with readNextBlock or setRecordBytes
 * before decoding values.
 */
public BCF2Decoder() {
    // intentionally empty: there is no state to initialize until a block is loaded
}
|
||||||
|
|
||||||
|
/**
 * Creates a decoder whose current block is the given bytes. Intended for tests.
 *
 * @param recordBytes the bytes to decode values from
 */
protected BCF2Decoder(final byte[] recordBytes) {
    this.setRecordBytes(recordBytes);
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Routines to load, set, skip blocks of underlying data we are decoding
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads the next record from input stream and prepare this decoder to decode values from it
|
||||||
|
*
|
||||||
|
* @param stream
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public void readNextBlock(final int blockSizeInBytes, final InputStream stream) {
|
||||||
|
setRecordBytes(readRecordBytes(blockSizeInBytes, stream));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Skips the next record from input stream, invalidating current block data
|
||||||
|
*
|
||||||
|
* @param stream
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) {
|
||||||
|
try {
|
||||||
|
final int bytesRead = (int)stream.skip(blockSizeInBytes);
|
||||||
|
validateReadBytes(bytesRead, blockSizeInBytes);
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e);
|
||||||
|
}
|
||||||
|
this.recordBytes = null;
|
||||||
|
this.recordStream = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the byte[] for the block of data we are currently decoding
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public byte[] getRecordBytes() {
|
||||||
|
return recordBytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The size of the current block in bytes
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public int getBlockSize() {
|
||||||
|
return recordBytes.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean blockIsFullyDecoded() {
|
||||||
|
return recordStream.available() == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use the recordBytes[] to read BCF2 records from now on
|
||||||
|
*
|
||||||
|
* @param recordBytes
|
||||||
|
*/
|
||||||
|
public void setRecordBytes(final byte[] recordBytes) {
|
||||||
|
this.recordBytes = recordBytes;
|
||||||
|
this.recordStream = new ByteArrayInputStream(recordBytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// High-level decoder
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
public final Object decodeTypedValue() {
|
||||||
|
final byte typeDescriptor = readTypeDescriptor();
|
||||||
|
return decodeTypedValue(typeDescriptor);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final Object decodeTypedValue(final byte typeDescriptor) {
|
||||||
|
final int size = TypeDescriptor.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : TypeDescriptor.decodeSize(typeDescriptor);
|
||||||
|
final BCFType type = TypeDescriptor.decodeType(typeDescriptor);
|
||||||
|
|
||||||
|
assert size >= 0;
|
||||||
|
|
||||||
|
if ( size == 0 ) {
|
||||||
|
return null;
|
||||||
|
} else if ( type == BCFType.CHAR ) { // special case string decoding for efficiency
|
||||||
|
return decodeLiteralString(size);
|
||||||
|
} else if ( size == 1 ) {
|
||||||
|
return decodeSingleValue(type);
|
||||||
|
} else {
|
||||||
|
final ArrayList<Object> ints = new ArrayList<Object>(size);
|
||||||
|
for ( int i = 0; i < size; i++ ) {
|
||||||
|
ints.add(decodeSingleValue(type));
|
||||||
|
}
|
||||||
|
return ints;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public final Object decodeSingleValue(final BCFType type) {
|
||||||
|
// TODO -- decodeTypedValue should integrate this routine
|
||||||
|
final int value = readInt(type.getSizeInBytes(), recordStream);
|
||||||
|
|
||||||
|
if ( value == type.getMissingBytes() )
|
||||||
|
return null;
|
||||||
|
else {
|
||||||
|
switch (type) {
|
||||||
|
case INT8:
|
||||||
|
case INT16:
|
||||||
|
case INT32: return value;
|
||||||
|
case FLOAT: return (double)rawFloatToFloat(value);
|
||||||
|
case CHAR: return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased
|
||||||
|
default: throw new ReviewedStingException("BCF2 codec doesn't know how to decode type " + type );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Decode raw primitive data types (ints, floats, and strings)
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
private final String decodeLiteralString(final int size) {
|
||||||
|
// TODO -- assumes size > 0
|
||||||
|
final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array
|
||||||
|
try {
|
||||||
|
recordStream.read(bytes);
|
||||||
|
return new String(bytes);
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new ReviewedStingException("readByte failure", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final int decodeVectorSize() {
|
||||||
|
final byte typeDescriptor = readTypeDescriptor();
|
||||||
|
final int size = TypeDescriptor.decodeSize(typeDescriptor);
|
||||||
|
final BCFType type = TypeDescriptor.decodeType(typeDescriptor);
|
||||||
|
|
||||||
|
assert size == 1;
|
||||||
|
assert type == BCFType.INT8 || type == BCFType.INT16 || type == BCFType.INT32;
|
||||||
|
|
||||||
|
return decodeInt(type.getSizeInBytes());
|
||||||
|
}
|
||||||
|
|
||||||
|
public final int decodeInt(int bytesForEachInt) {
|
||||||
|
return readInt(bytesForEachInt, recordStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final float rawFloatToFloat(final int rawFloat) {
|
||||||
|
return Float.intBitsToFloat(rawFloat);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Utility functions
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the size of the next block from inputStream
|
||||||
|
*
|
||||||
|
* @param inputStream
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public final int readBlockSize(final InputStream inputStream) {
|
||||||
|
return readInt(4, inputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param inputStream
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private final static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) {
|
||||||
|
final byte[] record = new byte[blockSizeInBytes];
|
||||||
|
try {
|
||||||
|
final int bytesRead = inputStream.read(record);
|
||||||
|
validateReadBytes(bytesRead, blockSizeInBytes);
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return record;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static void validateReadBytes(final int actuallyRead, final int expected) {
|
||||||
|
if ( actuallyRead < expected ) {
|
||||||
|
throw new UserException.MalformedBCF2(String.format("Failed to read next complete record: %s",
|
||||||
|
actuallyRead == -1 ?
|
||||||
|
"premature end of input stream" :
|
||||||
|
String.format("expected %d bytes but read only %d", expected, actuallyRead)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public final byte readTypeDescriptor() {
|
||||||
|
return readByte(recordStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static byte readByte(final InputStream stream) {
|
||||||
|
try {
|
||||||
|
return (byte)(stream.read() & 0xFF);
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new ReviewedStingException("readByte failure", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static int readInt(int bytesForEachInt, final InputStream stream) {
|
||||||
|
switch ( bytesForEachInt ) {
|
||||||
|
case 1: {
|
||||||
|
return (byte)(readByte(stream));
|
||||||
|
} case 2: {
|
||||||
|
final int b1 = readByte(stream) & 0xFF;
|
||||||
|
final int b2 = readByte(stream) & 0xFF;
|
||||||
|
return (short)((b1 << 8) | b2);
|
||||||
|
} case 4: {
|
||||||
|
final int b1 = readByte(stream) & 0xFF;
|
||||||
|
final int b2 = readByte(stream) & 0xFF;
|
||||||
|
final int b3 = readByte(stream) & 0xFF;
|
||||||
|
final int b4 = readByte(stream) & 0xFF;
|
||||||
|
return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4);
|
||||||
|
} default: throw new ReviewedStingException("Unexpected size during decoding");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,234 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple BCF2 encoder
|
||||||
|
*
|
||||||
|
* @author depristo
|
||||||
|
* @since 5/12
|
||||||
|
*/
|
||||||
|
public class BCF2Encoder {
|
||||||
|
// TODO -- increase default size?
|
||||||
|
public static final int WRITE_BUFFER_INITIAL_SIZE = 16384;
|
||||||
|
private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Functions to return the data being encoded here
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public int getRecordSizeInBytes() {
|
||||||
|
return encodeStream.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte[] getRecordBytes() {
|
||||||
|
byte[] bytes = encodeStream.toByteArray();
|
||||||
|
encodeStream.reset();
|
||||||
|
return bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Super-high level interface
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Totally generic encoder that examines o, determines the best way to encode it, and encodes it
|
||||||
|
* @param o
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public final BCFType encode(final Object o) throws IOException {
|
||||||
|
if ( o == null ) throw new ReviewedStingException("Generic encode cannot deal with null values");
|
||||||
|
|
||||||
|
if ( o instanceof String ) {
|
||||||
|
return encodeString((String)o);
|
||||||
|
} else if ( o instanceof List ) {
|
||||||
|
final BCFType type = determinePrimitiveType(((List) o).get(0));
|
||||||
|
encodeTypedVector((List) o, type);
|
||||||
|
return type;
|
||||||
|
} else {
|
||||||
|
final BCFType type = determinePrimitiveType(o);
|
||||||
|
encodeTypedSingleton(o, type);
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Writing typed values (have type byte)
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public final void encodeTypedMissing(final BCFType type) throws IOException {
|
||||||
|
encodeTypedVector(Collections.emptyList(), type);
|
||||||
|
}
|
||||||
|
|
||||||
|
// todo -- should be specialized for each object type for efficiency
|
||||||
|
public final void encodeTypedSingleton(final Object v, final BCFType type) throws IOException {
|
||||||
|
encodeTypedVector(Collections.singleton(v), type);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final BCFType encodeString(final String v) throws IOException {
|
||||||
|
// TODO -- this needs to be optimized
|
||||||
|
final byte[] bytes = v.getBytes();
|
||||||
|
final List<Byte> l = new ArrayList<Byte>(bytes.length);
|
||||||
|
for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]);
|
||||||
|
encodeTypedVector(l, BCFType.CHAR);
|
||||||
|
return BCFType.CHAR;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final <T extends Object> void encodeTypedVector(final Collection<T> v, final BCFType type) throws IOException {
|
||||||
|
encodeType(v.size(), type);
|
||||||
|
encodeRawValues(v, type);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final BCFType encodeTypedIntOfBestSize(final int value) throws IOException {
|
||||||
|
final BCFType type = determineIntegerType(value);
|
||||||
|
encodeTypedSingleton(value, type);
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Writing raw values (don't have a type byte)
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public final <T extends Object> void encodeRawValues(final Collection<T> v, final BCFType type) throws IOException {
|
||||||
|
for ( final T v1 : v ) {
|
||||||
|
encodeRawValue(v1, type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public final <T extends Object> void encodeRawValue(final T value, final BCFType type) throws IOException {
|
||||||
|
if ( value == type.getMissingJavaValue() )
|
||||||
|
encodeRawMissingValue(type);
|
||||||
|
else {
|
||||||
|
switch (type) {
|
||||||
|
case INT8:
|
||||||
|
case INT16:
|
||||||
|
case INT32: encodePrimitive((Integer)value, type); break;
|
||||||
|
case FLOAT: encodeRawFloat((Float) value, type); break;
|
||||||
|
case CHAR: encodeRawChar((Byte) value); break;
|
||||||
|
default: throw new ReviewedStingException("Illegal type encountered " + type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void encodeRawMissingValue(final BCFType type) throws IOException {
|
||||||
|
encodePrimitive(type.getMissingBytes(), type);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void encodeRawMissingValues(final int size, final BCFType type) throws IOException {
|
||||||
|
for ( int i = 0; i < size; i++ )
|
||||||
|
encodeRawMissingValue(type);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// low-level encoders
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public final void encodeRawChar(final byte c) throws IOException {
|
||||||
|
encodeStream.write(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void encodeRawFloat(final float value, final BCFType type) throws IOException {
|
||||||
|
encodePrimitive(Float.floatToIntBits(value), type);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void encodeType(final int size, final BCFType type) throws IOException {
|
||||||
|
final byte typeByte = TypeDescriptor.encodeTypeDescriptor(size, type);
|
||||||
|
encodeStream.write(typeByte);
|
||||||
|
if ( TypeDescriptor.willOverflow(size) )
|
||||||
|
encodeTypedIntOfBestSize(size);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void encodeRawInt(final int value, final BCFType type) throws IOException {
|
||||||
|
encodePrimitive(value, type, encodeStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void encodePrimitive(final int value, final BCFType type) throws IOException {
|
||||||
|
encodePrimitive(value, type, encodeStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// utility functions
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public final BCFType determineIntegerType(final List<Integer> values) {
|
||||||
|
BCFType maxType = BCFType.INT8;
|
||||||
|
for ( final int value : values ) {
|
||||||
|
final BCFType type1 = determineIntegerType(value);
|
||||||
|
switch ( type1 ) {
|
||||||
|
case INT8: break;
|
||||||
|
case INT16: maxType = BCFType.INT16; break;
|
||||||
|
case INT32: return BCFType.INT32; // fast path for largest possible value
|
||||||
|
default: throw new ReviewedStingException("Unexpected integer type " + type1 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return maxType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final BCFType determineIntegerType(final int value) {
|
||||||
|
for ( final BCFType potentialType : TypeDescriptor.INTEGER_TYPES_BY_SIZE ) {
|
||||||
|
if ( potentialType.withinRange(value) )
|
||||||
|
return potentialType;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final BCFType determinePrimitiveType(final Object v) {
|
||||||
|
if ( v instanceof Integer )
|
||||||
|
return determineIntegerType((Integer)v);
|
||||||
|
else if ( v instanceof Float )
|
||||||
|
return BCFType.FLOAT;
|
||||||
|
else
|
||||||
|
throw new ReviewedStingException("No native encoding for Object of type " + v.getClass().getSimpleName());
|
||||||
|
}
|
||||||
|
|
||||||
|
public final static void encodePrimitive(final int value, final BCFType type, final OutputStream encodeStream) throws IOException {
|
||||||
|
for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) {
|
||||||
|
final int shift = i * 8;
|
||||||
|
int mask = 0xFF << shift;
|
||||||
|
int byteValue = (mask & value) >> shift;
|
||||||
|
encodeStream.write(byteValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,136 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
import org.broad.tribble.FeatureCodecHeader;
|
||||||
|
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||||
|
import org.broadinstitute.sting.commandline.*;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Testing BCF2
|
||||||
|
*
|
||||||
|
* @author Mark DePristo
|
||||||
|
* @since 2012
|
||||||
|
*/
|
||||||
|
public class BCF2TestWalker extends RodWalker<Integer, Integer> {
|
||||||
|
/**
|
||||||
|
* Variants from this VCF file are used by this tool as input.
|
||||||
|
* The file must at least contain the standard VCF header lines, but
|
||||||
|
* can be empty (i.e., no variants are contained in the file).
|
||||||
|
*/
|
||||||
|
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
|
||||||
|
public RodBinding<VariantContext> variants;
|
||||||
|
|
||||||
|
@Argument(doc="keep variants", required=false)
|
||||||
|
public boolean keepVariants = false;
|
||||||
|
|
||||||
|
@Argument(doc="quiet", required=false)
|
||||||
|
public boolean quiet = false;
|
||||||
|
|
||||||
|
@Argument(doc="dontIndexOnTheFly", required=false)
|
||||||
|
public boolean dontIndexOnTheFly = false;
|
||||||
|
|
||||||
|
@Output(doc="File to which results should be written",required=true)
|
||||||
|
protected File bcfFile;
|
||||||
|
|
||||||
|
private final List<VariantContext> vcs = new ArrayList<VariantContext>();
|
||||||
|
protected BCF2Writer writer;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void initialize() {
|
||||||
|
final Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Collections.singletonList(variants));
|
||||||
|
final VCFHeader header = VCFUtils.withUpdatedContigs(vcfRods.values().iterator().next(), getToolkit());
|
||||||
|
try {
|
||||||
|
writer = new BCF2Writer("out", bcfFile, new FileOutputStream(bcfFile),
|
||||||
|
getToolkit().getMasterSequenceDictionary(), ! dontIndexOnTheFly );
|
||||||
|
writer.writeHeader(header);
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new UserException.CouldNotCreateOutputFile(bcfFile, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
|
||||||
|
writer.add(vc);
|
||||||
|
if ( keepVariants ) vcs.add(vc);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// default reduce -- doesn't do anything at all
|
||||||
|
//
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
|
||||||
|
|
||||||
|
public void onTraversalDone(Integer sum) {
|
||||||
|
try {
|
||||||
|
writer.close();
|
||||||
|
logger.info("Closed writer");
|
||||||
|
|
||||||
|
// read in the BCF records
|
||||||
|
BCF2Codec codec = new BCF2Codec();
|
||||||
|
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
|
||||||
|
FeatureCodecHeader header = codec.readHeader(pbs);
|
||||||
|
pbs.close();
|
||||||
|
|
||||||
|
pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
|
||||||
|
pbs.skip(header.getHeaderEnd());
|
||||||
|
Iterator<VariantContext> it = vcs.iterator();
|
||||||
|
while ( ! pbs.isDone() ) {
|
||||||
|
if ( keepVariants ) {
|
||||||
|
VariantContext expected = it.next();
|
||||||
|
if ( ! quiet )
|
||||||
|
System.out.printf("vcf = %s %d %s%n", expected.getChr(), expected.getStart(), expected);
|
||||||
|
}
|
||||||
|
VariantContext bcfRaw = codec.decode(pbs);
|
||||||
|
VariantContext bcf = new VariantContextBuilder(bcfRaw).source("variant").make();
|
||||||
|
if ( ! quiet ) {
|
||||||
|
System.out.printf("bcf = %s %d %s%n", bcf.getChr(), bcf.getStart(), bcf.toString());
|
||||||
|
System.out.printf("--------------------------------------------------%n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new UserException.CouldNotCreateOutputFile(bcfFile, "bad user!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,387 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class BCF2Writer extends IndexingVCFWriter {
|
||||||
|
final protected static Logger logger = Logger.getLogger(BCF2Writer.class);
|
||||||
|
private final static boolean doNotWriteGenotypes = false;
|
||||||
|
private OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support
|
||||||
|
private VCFHeader header;
|
||||||
|
private Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
|
||||||
|
private Map<String, Integer> stringDictionary = new LinkedHashMap<String, Integer>();
|
||||||
|
|
||||||
|
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
|
||||||
|
|
||||||
|
/**
 * Creates a BCF2 writer.
 *
 * All arguments are handed to the parent writer unchanged; records are then written to the
 * stream returned by {@code getOutputStream()} rather than directly to {@code output}.
 *
 * @param name                   name passed to the parent writer
 * @param location               file passed to the parent writer
 * @param output                 stream passed to the parent writer
 * @param refDict                sequence dictionary passed to the parent writer
 * @param enableOnTheFlyIndexing indexing flag passed to the parent writer
 */
public BCF2Writer(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) {
    super(name, location, output, refDict, enableOnTheFlyIndexing);
    outputStream = getOutputStream();
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Interface functions
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void writeHeader(final VCFHeader header) {
|
||||||
|
this.header = header;
|
||||||
|
|
||||||
|
// create the config offsets map
|
||||||
|
for ( final VCFContigHeaderLine contig : header.getContigLines())
|
||||||
|
contigDictionary.put(contig.getID(), contig.getContigIndex());
|
||||||
|
|
||||||
|
// set up the strings dictionary
|
||||||
|
int offset = 0;
|
||||||
|
stringDictionary.put(VCFConstants.PASSES_FILTERS_v4, offset++); // special case the special PASS field
|
||||||
|
for ( VCFHeaderLine line : header.getMetaData() ) {
|
||||||
|
if ( line instanceof VCFIDHeaderLine ) {
|
||||||
|
VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
|
||||||
|
stringDictionary.put(idLine.getID(), offset++);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// add the dictionary ##dictionary=x,y,z line to the header
|
||||||
|
final String dictionaryLineValue = Utils.join(BCF2Constants.DICTIONARY_LINE_ENTRY_SEPARATOR, stringDictionary.keySet());
|
||||||
|
header.addMetaDataLine(new VCFHeaderLine(BCF2Constants.DICTIONARY_LINE_TAG, dictionaryLineValue));
|
||||||
|
|
||||||
|
// write out the header
|
||||||
|
StandardVCFWriter.writeHeader(header, new OutputStreamWriter(outputStream), doNotWriteGenotypes, BCF2Constants.VERSION_LINE, "BCF2 stream");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void add( final VariantContext initialVC ) {
|
||||||
|
final VariantContext vc = initialVC.fullyDecode(header);
|
||||||
|
super.add(vc); // allow on the fly indexing
|
||||||
|
|
||||||
|
try {
|
||||||
|
final byte[] infoBlock = buildSitesData(vc);
|
||||||
|
final byte[] genotypesBlock = buildSamplesData(vc);
|
||||||
|
|
||||||
|
// write the two blocks to disk
|
||||||
|
writeBlock(infoBlock, genotypesBlock);
|
||||||
|
}
|
||||||
|
catch ( IOException e ) {
|
||||||
|
throw new UserException("Error writing record to BCF2 file: " + vc.toString(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
try {
|
||||||
|
outputStream.flush();
|
||||||
|
outputStream.close();
|
||||||
|
}
|
||||||
|
catch ( IOException e ) {
|
||||||
|
throw new UserException("Failed to close BCF2 file");
|
||||||
|
}
|
||||||
|
super.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// implicit block
|
||||||
|
//
|
||||||
|
// The first four records of BCF are inline untyped encoded data of:
|
||||||
|
//
|
||||||
|
// 4 byte integer chrom offset
|
||||||
|
// 4 byte integer start
|
||||||
|
// 4 byte integer ref length
|
||||||
|
// 4 byte float qual
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
private byte[] buildSitesData( VariantContext vc ) throws IOException {
|
||||||
|
final int contigIndex = contigDictionary.get(vc.getChr());
|
||||||
|
if ( contigIndex == -1 )
|
||||||
|
throw new UserException(String.format("Contig %s not found in sequence dictionary from reference", vc.getChr()));
|
||||||
|
|
||||||
|
// note use of encodeRawValue to not insert the typing byte
|
||||||
|
encoder.encodeRawValue(contigIndex, BCFType.INT32);
|
||||||
|
|
||||||
|
// pos
|
||||||
|
encoder.encodeRawValue(vc.getStart(), BCFType.INT32);
|
||||||
|
|
||||||
|
// ref length
|
||||||
|
encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCFType.INT32);
|
||||||
|
|
||||||
|
// qual
|
||||||
|
if ( vc.hasLog10PError() )
|
||||||
|
encoder.encodeRawFloat((float) vc.getPhredScaledQual(), BCFType.FLOAT);
|
||||||
|
else
|
||||||
|
encoder.encodeRawMissingValue(BCFType.FLOAT);
|
||||||
|
|
||||||
|
// info fields
|
||||||
|
final int nAlleles = vc.getNAlleles();
|
||||||
|
final int nInfo = vc.getAttributes().size();
|
||||||
|
final int nGenotypeFormatFields = StandardVCFWriter.calcVCFGenotypeKeys(vc).size();
|
||||||
|
final int nSamples = vc.getNSamples();
|
||||||
|
|
||||||
|
encoder.encodeRawInt((nAlleles << 16) | (nInfo & 0x00FF), BCFType.INT32);
|
||||||
|
encoder.encodeRawInt((nGenotypeFormatFields << 24) | (nSamples & 0x0FFF), BCFType.INT32);
|
||||||
|
|
||||||
|
buildID(vc);
|
||||||
|
buildAlleles(vc);
|
||||||
|
buildFilter(vc);
|
||||||
|
buildInfo(vc);
|
||||||
|
|
||||||
|
return encoder.getRecordBytes();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void buildID( VariantContext vc ) throws IOException {
|
||||||
|
encoder.encodeString(vc.getID());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void buildAlleles( VariantContext vc ) throws IOException {
|
||||||
|
for ( final Allele allele : vc.getAlleles() ) {
|
||||||
|
final String s = vc.getAlleleWithRefPadding(allele);
|
||||||
|
encoder.encodeString(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void buildFilter( VariantContext vc ) throws IOException {
|
||||||
|
if ( vc.isFiltered() ) {
|
||||||
|
encodeStringsByRef(vc.getFilters());
|
||||||
|
} else {
|
||||||
|
encoder.encodeTypedMissing(BCFType.INT32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void buildInfo( VariantContext vc ) throws IOException {
|
||||||
|
for ( Map.Entry<String, Object> infoFieldEntry : vc.getAttributes().entrySet() ) {
|
||||||
|
final String key = infoFieldEntry.getKey();
|
||||||
|
Object value = infoFieldEntry.getValue();
|
||||||
|
|
||||||
|
final VCFToBCFType typeEquiv = getBCF2TypeFromHeader(key, value);
|
||||||
|
// handle the special FLAG case -- super annoying
|
||||||
|
if ( typeEquiv.vcfType == VCFHeaderLineType.Flag ) value = 1;
|
||||||
|
|
||||||
|
encodeStringByRef(key);
|
||||||
|
if ( value instanceof List ) // NOTE: ONLY WORKS WITH LISTS
|
||||||
|
encoder.encodeTypedVector((List) value, typeEquiv.bcfType);
|
||||||
|
else if ( value instanceof String )
|
||||||
|
encoder.encodeString((String)value);
|
||||||
|
else
|
||||||
|
encoder.encodeTypedSingleton(value, typeEquiv.bcfType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] buildSamplesData(final VariantContext vc) throws IOException {
|
||||||
|
// write size
|
||||||
|
List<String> genotypeFields = StandardVCFWriter.calcVCFGenotypeKeys(vc);
|
||||||
|
for ( final String field : genotypeFields ) {
|
||||||
|
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
|
||||||
|
addGenotypes(vc);
|
||||||
|
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
|
||||||
|
addGQ(vc);
|
||||||
|
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
|
||||||
|
addGenotypeFilters(vc);
|
||||||
|
} else {
|
||||||
|
addGenericGenotypeField(vc, field);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return encoder.getRecordBytes();
|
||||||
|
}
|
||||||
|
|
||||||
|
private final int getNGenotypeFieldValues(final String field, final VariantContext vc) {
|
||||||
|
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field);
|
||||||
|
int nFields = metaData.getCount(vc.getAlternateAlleles().size());
|
||||||
|
if ( nFields == -1 ) { // unbounded, need to look at values
|
||||||
|
return computeMaxSizeOfGenotypeFieldFromValues(field, vc);
|
||||||
|
} else {
|
||||||
|
return nFields;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final int computeMaxSizeOfGenotypeFieldFromValues(final String field, final VariantContext vc) {
|
||||||
|
int size = 1;
|
||||||
|
final GenotypesContext gc = vc.getGenotypes();
|
||||||
|
|
||||||
|
for ( final Genotype g : gc ) {
|
||||||
|
final Object o = g.getAttribute(field);
|
||||||
|
if ( o == null ) continue;
|
||||||
|
if ( o instanceof List ) {
|
||||||
|
// only do compute if first value is of type list
|
||||||
|
final List values = (List)g.getAttribute(field);
|
||||||
|
if ( values != null )
|
||||||
|
size = Math.max(size, values.size());
|
||||||
|
} else {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void addGenericGenotypeField(final VariantContext vc, final String field) throws IOException {
|
||||||
|
final int numInFormatField = getNGenotypeFieldValues(field, vc);
|
||||||
|
final VCFToBCFType type = getBCF2TypeFromHeader(field, null);
|
||||||
|
|
||||||
|
startGenotypeField(field, numInFormatField, type.bcfType);
|
||||||
|
for ( final Genotype g : vc.getGenotypes() ) {
|
||||||
|
if ( ! g.hasAttribute(field) ) {
|
||||||
|
encoder.encodeRawMissingValues(numInFormatField, type.bcfType);
|
||||||
|
} else {
|
||||||
|
final Object val = g.getAttribute(field);
|
||||||
|
final Collection<Object> vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val;
|
||||||
|
encoder.encodeRawValues(vals, type.bcfType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final class VCFToBCFType {
|
||||||
|
VCFHeaderLineType vcfType;
|
||||||
|
BCFType bcfType;
|
||||||
|
|
||||||
|
private VCFToBCFType(final VCFHeaderLineType vcfType, final BCFType bcfType) {
|
||||||
|
this.vcfType = vcfType;
|
||||||
|
this.bcfType = bcfType;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO -- we really need explicit converters as first class objects
|
||||||
|
private final VCFToBCFType getBCF2TypeFromHeader(final String field, final Object maybeIntValue) {
|
||||||
|
// TODO -- need to generalize so we can enable vectors of compressed genotype ints
|
||||||
|
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field);
|
||||||
|
|
||||||
|
// TODO -- no sense in allocating these over and over
|
||||||
|
switch ( metaData.getType() ) {
|
||||||
|
case Character: return new VCFToBCFType(metaData.getType(), BCFType.CHAR);
|
||||||
|
case Flag: return new VCFToBCFType(metaData.getType(), BCFType.INT8);
|
||||||
|
case String: return new VCFToBCFType(metaData.getType(), BCFType.CHAR);
|
||||||
|
case Integer: return new VCFToBCFType(metaData.getType(), maybeIntValue != null ? encoder.determineIntegerType((Integer)maybeIntValue) : BCFType.INT32);
|
||||||
|
case Float: return new VCFToBCFType(metaData.getType(), BCFType.FLOAT);
|
||||||
|
default: throw new ReviewedStingException("Unexpected type for field " + field);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void addGenotypeFilters(final VariantContext vc) throws IOException {
|
||||||
|
logger.warn("Skipping genotype filter field");
|
||||||
|
// // TODO -- FIXME -- string is wrong here -- need to compute string size...
|
||||||
|
// startGenotypeField(VCFConstants.GENOTYPE_FILTER_KEY, 1, BCFType.CHAR);
|
||||||
|
// for ( final Genotype g : vc.getGenotypes() ) {
|
||||||
|
// if ( g.filtersWereApplied() && g.isFiltered() ) {
|
||||||
|
// encoder.encodeString(ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())));
|
||||||
|
// } else {
|
||||||
|
// encoder.encodeRawMissingValues(1, BCFType.CHAR); // todo fixme
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void addGQ(final VariantContext vc) throws IOException {
|
||||||
|
startGenotypeField(VCFConstants.GENOTYPE_QUALITY_KEY, 1, BCFType.INT8);
|
||||||
|
for ( final Genotype g : vc.getGenotypes() ) {
|
||||||
|
if ( g.hasLog10PError() ) {
|
||||||
|
final int GQ = (int)Math.round(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL));
|
||||||
|
if ( GQ > VCFConstants.MAX_GENOTYPE_QUAL ) throw new ReviewedStingException("Unexpectedly large GQ " + GQ + " at " + vc);
|
||||||
|
encoder.encodeRawValue(GQ, BCFType.INT8);
|
||||||
|
} else {
|
||||||
|
encoder.encodeRawMissingValues(1, BCFType.INT8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void addGenotypes(final VariantContext vc) throws IOException {
|
||||||
|
if ( vc.getNAlleles() > 127 )
|
||||||
|
throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
|
||||||
|
"with > 127 alleles, but you have " + vc.getNAlleles() + " at "
|
||||||
|
+ vc.getChr() + ":" + vc.getStart());
|
||||||
|
|
||||||
|
final Map<Allele, String> alleleMap = StandardVCFWriter.buildAlleleMap(vc);
|
||||||
|
final int requiredPloidy = 2; // TODO -- handle ploidy, will need padding / depadding
|
||||||
|
startGenotypeField(VCFConstants.GENOTYPE_KEY, requiredPloidy, BCFType.INT8);
|
||||||
|
for ( final Genotype g : vc.getGenotypes() ) {
|
||||||
|
if ( g.getPloidy() != requiredPloidy ) throw new ReviewedStingException("Cannot currently handle non-diploid calls!");
|
||||||
|
final List<Integer> encoding = new ArrayList<Integer>(requiredPloidy);
|
||||||
|
for ( final Allele a : g.getAlleles() ) {
|
||||||
|
final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a));
|
||||||
|
encoding.add(((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00));
|
||||||
|
}
|
||||||
|
encoder.encodeRawValues(encoding, BCFType.INT8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write the data in the encoder to the outputstream as a length encoded
|
||||||
|
* block of data. After this call the encoder stream will be ready to
|
||||||
|
* start a new data block
|
||||||
|
*
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException {
|
||||||
|
BCF2Encoder.encodePrimitive(infoBlock.length, BCFType.INT32, outputStream);
|
||||||
|
BCF2Encoder.encodePrimitive(genotypesBlock.length, BCFType.INT32, outputStream);
|
||||||
|
outputStream.write(infoBlock);
|
||||||
|
outputStream.write(genotypesBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final BCFType encodeStringByRef(final String string) throws IOException {
|
||||||
|
return encodeStringsByRef(Collections.singleton(string));
|
||||||
|
}
|
||||||
|
|
||||||
|
public final BCFType encodeStringsByRef(final Collection<String> strings) throws IOException {
|
||||||
|
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
|
||||||
|
BCFType maxType = BCFType.INT8; // start with the smallest size
|
||||||
|
|
||||||
|
// iterate over strings until we find one that needs 16 bits, and break
|
||||||
|
for ( final String string : strings ) {
|
||||||
|
final int offset = stringDictionary.get(string);
|
||||||
|
offsets.add(offset);
|
||||||
|
final BCFType type1 = encoder.determineIntegerType(offset);
|
||||||
|
switch ( type1 ) {
|
||||||
|
case INT8: break;
|
||||||
|
case INT16: if ( maxType == BCFType.INT8 ) maxType = BCFType.INT16; break;
|
||||||
|
case INT32: maxType = BCFType.INT32; break;
|
||||||
|
default: throw new ReviewedStingException("Unexpected type " + type1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// we've checked the types for all strings, so write them out
|
||||||
|
encoder.encodeTypedVector(offsets, maxType);
|
||||||
|
return maxType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void startGenotypeField(final String key, final int size, final BCFType valueType) throws IOException {
|
||||||
|
encodeStringByRef(key);
|
||||||
|
encoder.encodeType(size, valueType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,71 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* BCF2 types and information
|
||||||
|
*
|
||||||
|
* @author depristo
|
||||||
|
* @since 05/12
|
||||||
|
*/
|
||||||
|
public enum BCFType {
|
||||||
|
RESERVED_0,
|
||||||
|
INT8(1, BCF2Constants.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
|
||||||
|
INT16(2, BCF2Constants.INT16_MISSING_VALUE, -32767, 32767),
|
||||||
|
INT32(4, BCF2Constants.INT32_MISSING_VALUE, -2147483647, 2147483647),
|
||||||
|
RESERVED_4,
|
||||||
|
FLOAT(4, BCF2Constants.FLOAT_MISSING_VALUE),
|
||||||
|
RESERVED_6,
|
||||||
|
CHAR;
|
||||||
|
|
||||||
|
private final Object missingJavaValue;
|
||||||
|
private final int missingBytes;
|
||||||
|
private final int sizeInBytes;
|
||||||
|
private final long minValue, maxValue;
|
||||||
|
|
||||||
|
BCFType() {
|
||||||
|
this(-1, 0, 0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
BCFType(final int sizeInBytes, final int missingBytes) {
|
||||||
|
this(sizeInBytes, missingBytes, 0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
BCFType(final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) {
|
||||||
|
this.sizeInBytes = sizeInBytes;
|
||||||
|
this.missingJavaValue = null;
|
||||||
|
this.missingBytes = missingBytes;
|
||||||
|
this.minValue = minValue;
|
||||||
|
this.maxValue = maxValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getSizeInBytes() {
|
||||||
|
return sizeInBytes;
|
||||||
|
}
|
||||||
|
public int getID() { return ordinal(); }
|
||||||
|
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
|
||||||
|
public Object getMissingJavaValue() { return missingJavaValue; }
|
||||||
|
public int getMissingBytes() { return missingBytes; }
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,71 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple BCF decoder
|
||||||
|
* @author Mark DePristo
|
||||||
|
* @since 5/3/12
|
||||||
|
*/
|
||||||
|
public class TypeDescriptor {
|
||||||
|
public static final int OVERFLOW_ELEMENT_MARKER = 15;
|
||||||
|
public static final int MAX_INLINE_ELEMENTS = 14;
|
||||||
|
|
||||||
|
public final static BCFType[] INTEGER_TYPES_BY_SIZE = new BCFType[3];
|
||||||
|
public final static BCFType[] DICTIONARY_TYPES_BY_SIZE = INTEGER_TYPES_BY_SIZE;
|
||||||
|
private final static BCFType[] lookup = BCFType.values();
|
||||||
|
|
||||||
|
static {
|
||||||
|
INTEGER_TYPES_BY_SIZE[0] = BCFType.INT8;
|
||||||
|
INTEGER_TYPES_BY_SIZE[1] = BCFType.INT16;
|
||||||
|
INTEGER_TYPES_BY_SIZE[2] = BCFType.INT32;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final static byte encodeTypeDescriptor(final int nElements, final BCFType type ) {
|
||||||
|
int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER);
|
||||||
|
byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F));
|
||||||
|
return typeByte;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final static int decodeSize(final byte typeDescriptor) {
|
||||||
|
return (0xF0 & typeDescriptor) >> 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final static int decodeTypeID(final byte typeDescriptor) {
|
||||||
|
return typeDescriptor & 0x0F;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final static BCFType decodeType(final byte typeDescriptor) {
|
||||||
|
return lookup[decodeTypeID(typeDescriptor)];
|
||||||
|
}
|
||||||
|
|
||||||
|
public final static boolean sizeIsOverflow(final byte typeDescriptor) {
|
||||||
|
return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final static boolean willOverflow(final long nElements) {
|
||||||
|
return nElements > MAX_INLINE_ELEMENTS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,362 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// our package
|
||||||
|
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||||
|
|
||||||
|
|
||||||
|
// the imports for unit testing.
|
||||||
|
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeSuite;
|
||||||
|
import org.testng.annotations.DataProvider;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
public class EncoderDecoderUnitTest extends BaseTest {
|
||||||
|
private final float FLOAT_TOLERANCE = (float)1e-8;
|
||||||
|
final List<BCF2TypedValue> primitives = new ArrayList<BCF2TypedValue>();
|
||||||
|
final List<BCF2TypedValue> basicTypes = new ArrayList<BCF2TypedValue>();
|
||||||
|
final List<BCF2TypedValue> forCombinations = new ArrayList<BCF2TypedValue>();
|
||||||
|
|
||||||
|
@BeforeSuite
|
||||||
|
public void before() {
|
||||||
|
basicTypes.add(new BCF2TypedValue(1, BCFType.INT8));
|
||||||
|
basicTypes.add(new BCF2TypedValue(1000, BCFType.INT16));
|
||||||
|
basicTypes.add(new BCF2TypedValue(1000000, BCFType.INT32));
|
||||||
|
basicTypes.add(new BCF2TypedValue(1.2345e6, BCFType.FLOAT));
|
||||||
|
basicTypes.add(new BCF2TypedValue(new Byte((byte)'A'), BCFType.CHAR));
|
||||||
|
|
||||||
|
// small ints
|
||||||
|
primitives.add(new BCF2TypedValue(0, BCFType.INT8));
|
||||||
|
primitives.add(new BCF2TypedValue(10, BCFType.INT8));
|
||||||
|
primitives.add(new BCF2TypedValue(-1, BCFType.INT8));
|
||||||
|
primitives.add(new BCF2TypedValue(100, BCFType.INT8));
|
||||||
|
primitives.add(new BCF2TypedValue(-100, BCFType.INT8));
|
||||||
|
primitives.add(new BCF2TypedValue(-127, BCFType.INT8)); // last value in range
|
||||||
|
primitives.add(new BCF2TypedValue( 127, BCFType.INT8)); // last value in range
|
||||||
|
|
||||||
|
// medium ints
|
||||||
|
primitives.add(new BCF2TypedValue(-1000, BCFType.INT16));
|
||||||
|
primitives.add(new BCF2TypedValue(1000, BCFType.INT16));
|
||||||
|
primitives.add(new BCF2TypedValue(-128, BCFType.INT16)); // first value in range
|
||||||
|
primitives.add(new BCF2TypedValue( 128, BCFType.INT16)); // first value in range
|
||||||
|
primitives.add(new BCF2TypedValue(-32767, BCFType.INT16)); // last value in range
|
||||||
|
primitives.add(new BCF2TypedValue( 32767, BCFType.INT16)); // last value in range
|
||||||
|
|
||||||
|
// larger ints
|
||||||
|
primitives.add(new BCF2TypedValue(-32768, BCFType.INT32)); // first value in range
|
||||||
|
primitives.add(new BCF2TypedValue( 32768, BCFType.INT32)); // first value in range
|
||||||
|
primitives.add(new BCF2TypedValue(-100000, BCFType.INT32));
|
||||||
|
primitives.add(new BCF2TypedValue(100000, BCFType.INT32));
|
||||||
|
primitives.add(new BCF2TypedValue(-2147483647, BCFType.INT32));
|
||||||
|
primitives.add(new BCF2TypedValue(2147483647, BCFType.INT32));
|
||||||
|
|
||||||
|
// floats
|
||||||
|
primitives.add(new BCF2TypedValue(0.0, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(-0.0, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(1.0, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(-1.0, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(1.1, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(-1.1, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(5.0 / 3.0, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(-5.0 / 3.0, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(1.23e3, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(1.23e6, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(1.23e9, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(1.23e12, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(1.23e15, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(-1.23e3, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(-1.23e6, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(-1.23e9, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(-1.23e12, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(-1.23e15, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(Float.MIN_VALUE, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(Float.MAX_VALUE, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(Float.NEGATIVE_INFINITY, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(Float.POSITIVE_INFINITY, BCFType.FLOAT));
|
||||||
|
primitives.add(new BCF2TypedValue(Float.NaN, BCFType.FLOAT));
|
||||||
|
|
||||||
|
// strings
|
||||||
|
//primitives.add(new BCF2TypedValue("", BCFType.CHAR)); <- will be null (which is right)
|
||||||
|
primitives.add(new BCF2TypedValue("S", BCFType.CHAR));
|
||||||
|
primitives.add(new BCF2TypedValue("S2", BCFType.CHAR));
|
||||||
|
primitives.add(new BCF2TypedValue("12345678910", BCFType.CHAR));
|
||||||
|
primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR));
|
||||||
|
primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR));
|
||||||
|
|
||||||
|
// missing values
|
||||||
|
for ( BCFType type : BCFType.values() ) {
|
||||||
|
primitives.add(new BCF2TypedValue(null, type));
|
||||||
|
}
|
||||||
|
|
||||||
|
forCombinations.add(new BCF2TypedValue(10, BCFType.INT8));
|
||||||
|
forCombinations.add(new BCF2TypedValue(100, BCFType.INT8));
|
||||||
|
forCombinations.add(new BCF2TypedValue(-100, BCFType.INT8));
|
||||||
|
forCombinations.add(new BCF2TypedValue(-128, BCFType.INT16)); // first value in range
|
||||||
|
forCombinations.add(new BCF2TypedValue( 128, BCFType.INT16)); // first value in range
|
||||||
|
forCombinations.add(new BCF2TypedValue(-100000, BCFType.INT32));
|
||||||
|
forCombinations.add(new BCF2TypedValue(100000, BCFType.INT32));
|
||||||
|
forCombinations.add(new BCF2TypedValue(0.0, BCFType.FLOAT));
|
||||||
|
forCombinations.add(new BCF2TypedValue(1.23e6, BCFType.FLOAT));
|
||||||
|
forCombinations.add(new BCF2TypedValue(-1.23e6, BCFType.FLOAT));
|
||||||
|
forCombinations.add(new BCF2TypedValue("S", BCFType.CHAR));
|
||||||
|
forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR));
|
||||||
|
forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCFType.CHAR));
|
||||||
|
|
||||||
|
// missing values
|
||||||
|
for ( BCFType type : BCFType.values() ) {
|
||||||
|
forCombinations.add(new BCF2TypedValue(null, type));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// merge case Provider
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private class BCF2TypedValue {
|
||||||
|
final BCFType type;
|
||||||
|
final Object value;
|
||||||
|
|
||||||
|
private BCF2TypedValue(final int value, final BCFType type) {
|
||||||
|
this(new Integer(value), type);
|
||||||
|
}
|
||||||
|
|
||||||
|
private BCF2TypedValue(final double value, final BCFType type) {
|
||||||
|
this(new Float(value), type);
|
||||||
|
}
|
||||||
|
|
||||||
|
private BCF2TypedValue(final Object value, final BCFType type) {
|
||||||
|
this.type = type;
|
||||||
|
this.value = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isMissing() { return value == null; }
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return String.format("%s of %s", value, type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@DataProvider(name = "BCF2EncodingTestProviderSingletons")
|
||||||
|
public Object[][] BCF2EncodingTestProviderSingletons() {
|
||||||
|
List<Object[]> tests = new ArrayList<Object[]>();
|
||||||
|
for ( BCF2TypedValue tv : primitives )
|
||||||
|
tests.add(new Object[]{Arrays.asList(tv)});
|
||||||
|
return tests.toArray(new Object[][]{});
|
||||||
|
}
|
||||||
|
|
||||||
|
@DataProvider(name = "BCF2EncodingTestProviderBasicTypes")
|
||||||
|
public Object[][] BCF2EncodingTestProviderBasicTypes() {
|
||||||
|
List<Object[]> tests = new ArrayList<Object[]>();
|
||||||
|
for ( BCF2TypedValue tv : basicTypes )
|
||||||
|
tests.add(new Object[]{Arrays.asList(tv)});
|
||||||
|
return tests.toArray(new Object[][]{});
|
||||||
|
}
|
||||||
|
|
||||||
|
@DataProvider(name = "BCF2EncodingTestProviderSequences")
|
||||||
|
public Object[][] BCF2EncodingTestProviderSequences() {
|
||||||
|
List<Object[]> tests = new ArrayList<Object[]>();
|
||||||
|
for ( BCF2TypedValue tv1 : forCombinations )
|
||||||
|
for ( BCF2TypedValue tv2 : forCombinations )
|
||||||
|
for ( BCF2TypedValue tv3 : forCombinations )
|
||||||
|
tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3)});
|
||||||
|
return tests.toArray(new Object[][]{});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "BCF2EncodingTestProviderSingletons")
|
||||||
|
public void testBCF2EncodingSingletons(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||||
|
final byte[] record = encodeRecord(toEncode);
|
||||||
|
decodeRecord(toEncode, record);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
|
||||||
|
public void testBCF2EncodingVectors(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||||
|
for ( final BCF2TypedValue tv : toEncode ) {
|
||||||
|
for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) {
|
||||||
|
BCF2Encoder encoder = new BCF2Encoder();
|
||||||
|
List<Object> expected = Collections.nCopies(length, tv.value);
|
||||||
|
encoder.encodeTypedVector(expected, tv.type);
|
||||||
|
|
||||||
|
BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
|
||||||
|
final Object decoded = decoder.decodeTypedValue();
|
||||||
|
|
||||||
|
if ( tv.type == BCFType.CHAR ) {
|
||||||
|
Assert.assertTrue(decoded instanceof String);
|
||||||
|
final String decodedString = (String)decoded;
|
||||||
|
Assert.assertTrue(decodedString.length() == length);
|
||||||
|
} else {
|
||||||
|
Assert.assertTrue(decoded instanceof List);
|
||||||
|
final List<Object> decodedList = (List<Object>)decoded;
|
||||||
|
Assert.assertEquals(decodedList.size(), expected.size());
|
||||||
|
for ( Object decodedValue : decodedList )
|
||||||
|
myAssertEquals(tv, decodedValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
|
||||||
|
public void testBCF2EncodingVectorsWithMissing(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||||
|
for ( final BCF2TypedValue tv : toEncode ) {
|
||||||
|
if ( tv.type != BCFType.CHAR ) {
|
||||||
|
for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) {
|
||||||
|
final byte td = TypeDescriptor.encodeTypeDescriptor(1, tv.type);
|
||||||
|
|
||||||
|
final BCF2Encoder encoder = new BCF2Encoder();
|
||||||
|
for ( int i = 0; i < length; i++ ) {
|
||||||
|
encoder.encodeRawValue(i % 2 == 0 ? null : tv.value, tv.type);
|
||||||
|
}
|
||||||
|
|
||||||
|
final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
|
||||||
|
|
||||||
|
for ( int i = 0; i < length; i++ ) {
|
||||||
|
final Object decoded = decoder.decodeTypedValue(td);
|
||||||
|
myAssertEquals(i % 2 == 0 ? new BCF2TypedValue(null, tv.type) : tv, decoded);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingSingletons")
|
||||||
|
public void testBCF2EncodingTestProviderSequences(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||||
|
final byte[] record = encodeRecord(toEncode);
|
||||||
|
decodeRecord(toEncode, record);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingTestProviderSequences")
|
||||||
|
public void testReadAndSkipWithMultipleBlocks(final List<BCF2TypedValue> block) throws IOException {
|
||||||
|
testReadAndSkipWithMultipleBlocks(block, forCombinations);
|
||||||
|
testReadAndSkipWithMultipleBlocks(forCombinations, block);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testReadAndSkipWithMultipleBlocks(final List<BCF2TypedValue> block1, final List<BCF2TypedValue> block2) throws IOException {
|
||||||
|
final byte[] record1 = encodeRecord(block1);
|
||||||
|
final byte[] record2 = encodeRecord(block2);
|
||||||
|
|
||||||
|
// each record is individually good
|
||||||
|
decodeRecord(block1, record1);
|
||||||
|
decodeRecord(block2, record2);
|
||||||
|
|
||||||
|
BCF2Decoder decoder = new BCF2Decoder();
|
||||||
|
|
||||||
|
// test setting
|
||||||
|
decoder.setRecordBytes(record1);
|
||||||
|
decodeRecord(block1, decoder);
|
||||||
|
decoder.setRecordBytes(record2);
|
||||||
|
decodeRecord(block2, decoder);
|
||||||
|
|
||||||
|
// test combining the streams
|
||||||
|
final byte[] combined = combineRecords(record1, record2);
|
||||||
|
final List<BCF2TypedValue> combinedObjects = new ArrayList<BCF2TypedValue>(block1);
|
||||||
|
combinedObjects.addAll(block2);
|
||||||
|
|
||||||
|
// the combined bytes is the same as the combined objects
|
||||||
|
InputStream stream = new ByteArrayInputStream(combined);
|
||||||
|
decoder.readNextBlock(record1.length, stream);
|
||||||
|
decodeRecord(block1, decoder);
|
||||||
|
decoder.readNextBlock(record2.length, stream);
|
||||||
|
decodeRecord(block2, decoder);
|
||||||
|
|
||||||
|
// skipping the first block allows us to read the second block directly
|
||||||
|
stream = new ByteArrayInputStream(combined);
|
||||||
|
decoder.skipNextBlock(record1.length, stream);
|
||||||
|
decoder.readNextBlock(record2.length, stream);
|
||||||
|
decodeRecord(block2, decoder);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException {
|
||||||
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
|
baos.write(record1);
|
||||||
|
baos.write(record2);
|
||||||
|
return baos.toByteArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
private final byte[] encodeRecord(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||||
|
BCF2Encoder encoder = new BCF2Encoder();
|
||||||
|
|
||||||
|
for ( final BCF2TypedValue tv : toEncode ) {
|
||||||
|
if ( tv.isMissing() )
|
||||||
|
encoder.encodeTypedMissing(tv.type);
|
||||||
|
else {
|
||||||
|
final BCFType encodedType = encoder.encode(tv.value);
|
||||||
|
if ( tv.type != null ) // only if we have an expectation
|
||||||
|
Assert.assertEquals(encodedType, tv.type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check output
|
||||||
|
final byte[] record = encoder.getRecordBytes();
|
||||||
|
Assert.assertNotNull(record);
|
||||||
|
Assert.assertTrue(record.length > 0);
|
||||||
|
return record;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void decodeRecord(final List<BCF2TypedValue> toEncode, final byte[] record) {
|
||||||
|
decodeRecord(toEncode, new BCF2Decoder(record));
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void decodeRecord(final List<BCF2TypedValue> toEncode, final BCF2Decoder decoder) {
|
||||||
|
for ( final BCF2TypedValue tv : toEncode ) {
|
||||||
|
Assert.assertFalse(decoder.blockIsFullyDecoded());
|
||||||
|
final Object decoded = decoder.decodeTypedValue();
|
||||||
|
|
||||||
|
myAssertEquals(tv, decoded);
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertTrue(decoder.blockIsFullyDecoded());
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void myAssertEquals(final BCF2TypedValue tv, final Object decoded) {
|
||||||
|
if ( tv.value == null ) { // special needs for instanceof double
|
||||||
|
Assert.assertEquals(decoded, tv.value);
|
||||||
|
} else if ( tv.type == BCFType.FLOAT ) { // need tolerance for floats, and they aren't null
|
||||||
|
Assert.assertTrue(decoded instanceof Double);
|
||||||
|
|
||||||
|
final float valueFloat = (float)(Float)tv.value;
|
||||||
|
final float decodedFloat = (float)(double)(Double)decoded;
|
||||||
|
|
||||||
|
if ( Float.isNaN(valueFloat) ) // NaN == NaN => false unfortunately
|
||||||
|
Assert.assertTrue(Float.isNaN(decodedFloat));
|
||||||
|
else {
|
||||||
|
Assert.assertEquals(decodedFloat, valueFloat, FLOAT_TOLERANCE);
|
||||||
|
}
|
||||||
|
} else
|
||||||
|
Assert.assertEquals(decoded, tv.value);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue