Efficient reading of genotype fields v1

-- decodeIntArray in BCF2 decoder allows us to more efficiently read ints and int[] from stream directly into Genotype object
-- Code cleanup / contracts added were appropriate
-- V2 will have a yet more optimized path...
This commit is contained in:
Mark DePristo 2012-06-04 12:17:15 -04:00
parent 3a70f8216b
commit 1d4eb46606
5 changed files with 128 additions and 47 deletions

View File

@ -191,12 +191,12 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) { private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) {
final int contigOffset = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); final int contigOffset = decoder.decodeInt(BCF2Type.INT32);
final String contig = lookupContigName(contigOffset); final String contig = lookupContigName(contigOffset);
builder.chr(contig); builder.chr(contig);
final int pos = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); final int pos = decoder.decodeInt(BCF2Type.INT32);
final int refLength = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); final int refLength = decoder.decodeInt(BCF2Type.INT32);
builder.start((long)pos); builder.start((long)pos);
builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals
@ -205,8 +205,8 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.log10PError(((Double)qual) / -10.0); builder.log10PError(((Double)qual) / -10.0);
} }
final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32);
final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32);
final int nAlleles = nAlleleInfo >> 16; final int nAlleles = nAlleleInfo >> 16;
final int nInfo = nAlleleInfo & 0x00FF; final int nInfo = nAlleleInfo & 0x00FF;
final int nFormatFields = nFormatSamples >> 24; final int nFormatFields = nFormatSamples >> 24;

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2; package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureCodec;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -33,6 +35,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
public class BCF2Decoder { public class BCF2Decoder {
final protected static Logger logger = Logger.getLogger(FeatureCodec.class); final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
@ -131,7 +134,7 @@ public class BCF2Decoder {
} }
public final Object decodeTypedValue(final byte typeDescriptor) { public final Object decodeTypedValue(final byte typeDescriptor) {
final int size = BCF2Utils.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : BCF2Utils.decodeSize(typeDescriptor); final int size = decodeNumberOfElements(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size >= 0; assert size >= 0;
@ -155,7 +158,7 @@ public class BCF2Decoder {
public final Object decodeSingleValue(final BCF2Type type) { public final Object decodeSingleValue(final BCF2Type type) {
// TODO -- decodeTypedValue should integrate this routine // TODO -- decodeTypedValue should integrate this routine
final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream); final int value = decodeInt(type);
if ( value == type.getMissingBytes() ) if ( value == type.getMissingBytes() )
return null; return null;
@ -191,22 +194,94 @@ public class BCF2Decoder {
} }
} }
private final int decodeVectorSize() { @Ensures("result >= 0")
final byte typeDescriptor = readTypeDescriptor(); public final int decodeNumberOfElements(final byte typeDescriptor) {
final int size = BCF2Utils.decodeSize(typeDescriptor); if ( BCF2Utils.sizeIsOverflow(typeDescriptor) )
// -1 ensures we explode immediately with a bad size if the result is missing
return decodeInt(readTypeDescriptor(), -1);
else
// the size is inline, so just decode it
return BCF2Utils.decodeSize(typeDescriptor);
}
/**
* Decode an int from the stream. If the value in the stream is missing,
* returns missingValue. Requires the typeDescriptor indicate an inline
* single element event
*
* @param typeDescriptor
* @return
*/
@Requires("BCF2Utils.decodeSize(typeDescriptor) == 1")
public final int decodeInt(final byte typeDescriptor, final int missingValue) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
final int i = decodeInt(type);
assert size == 1; return i == type.getMissingBytes() ? missingValue : i;
assert type == BCF2Type.INT8 || type == BCF2Type.INT16 || type == BCF2Type.INT32;
return decodeInt(type.getSizeInBytes());
} }
public final int decodeInt(int bytesForEachInt) { @Requires("type != null")
return BCF2Utils.readInt(bytesForEachInt, recordStream); public final int decodeInt(final BCF2Type type) {
return BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
} }
public final double rawFloatToFloat(final int rawFloat) { /**
* Low-level reader for int[]
*
* Requires a typeDescriptor so the function knows how many elements to read,
* and how they are encoded.
*
* If size == 0 => result is null
* If size > 0 => result depends on the actual values in the stream
* -- If the first element read is MISSING, result is null (all values are missing)
* -- Else result = int[N] where N is the first N non-missing values decoded
*
* @param maybeDest if not null we'll not allocate space for the vector, but instead use
* the externally allocated array of ints to store values. If the
* size of this vector is < the actual size of the elements, we'll be
* forced to use freshly allocated arrays. Also note that padded
* int elements are still forced to do a fresh allocation as well.
* @return see description
*/
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0"})
public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) {
if ( size == 0 ) {
return null;
} else {
if ( maybeDest != null && maybeDest.length < size )
maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small
final int val1 = decodeInt(type);
if ( val1 == type.getMissingBytes() ) {
// fast path for first element being missing
for ( int i = 1; i < size; i++ ) decodeInt(type);
return null;
} else {
// we know we will have at least 1 element, so making the int[] is worth it
final int[] ints = maybeDest == null ? new int[size] : maybeDest;
ints[0] = val1; // we already read the first one
for ( int i = 1; i < size; i++ ) {
ints[i] = decodeInt(type);
if ( ints[i] == type.getMissingBytes() ) {
// read the rest of the missing values, dropping them
for ( int j = i + 1; j < size; j++ ) decodeInt(type);
// deal with auto-pruning by returning an int[] containing
// only the non-MISSING values. We do this by copying the first
// i elements, as i itself is missing
return Arrays.copyOf(ints, i);
}
}
return ints; // all of the elements were non-MISSING
}
}
}
public final int[] decodeIntArray(final byte typeDescriptor) {
final int size = decodeNumberOfElements(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
return decodeIntArray(size, type, null);
}
public final double rawFloatToFloat(final int rawFloat) {
return (double)Float.intBitsToFloat(rawFloat); return (double)Float.intBitsToFloat(rawFloat);
} }

View File

@ -102,27 +102,29 @@ public class BCF2GenotypeFieldDecoders {
private class GTDecoder implements Decoder { private class GTDecoder implements Decoder {
@Override @Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) { public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
// we have to do a bit of low-level processing here as we want to know the size upfronta
final int size = decoder.decodeNumberOfElements(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
// a single cache for the encoded genotypes, since we don't actually need this vector
final int[] tmp = new int[size];
// TODO -- fast path for size == 2 (diploid) and many samples
// TODO -- by creating all 4 allele combinations and doing a straight lookup instead of allocations them
for ( final GenotypeBuilder gb : gbs ) { for ( final GenotypeBuilder gb : gbs ) {
// TODO -- fast path for size == 2 (diploid) final int[] encoded = decoder.decodeIntArray(size, type, tmp);
final List<Integer> encoded = (List<Integer>)decoder.decodeTypedValue(typeDescriptor);
if ( encoded == null ) if ( encoded == null )
// no called sample GT = . // no called sample GT = .
gb.alleles(null); gb.alleles(null);
else { else {
// we have at least some alleles to decode assert encoded.length > 0;
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
for ( final Integer encode : encoded ) { // we have at least some alleles to decode
if ( encode == null ) { final List<Allele> gt = new ArrayList<Allele>(encoded.length);
// absent, as are all following by definition
break; for ( final int encode : encoded ) {
} else { final int offset = encode >> 1;
final int offset = encode >> 1; gt.add(offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1));
if ( offset == 0 )
gt.add(Allele.NO_CALL);
else
gt.add(siteAlleles.get(offset - 1));
}
} }
gb.alleles(gt); gb.alleles(gt);
@ -135,9 +137,8 @@ public class BCF2GenotypeFieldDecoders {
@Override @Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) { public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
for ( final GenotypeBuilder gb : gbs ) { for ( final GenotypeBuilder gb : gbs ) {
final Object value = decoder.decodeTypedValue(typeDescriptor); // the -1 is for missing
if ( value != null ) gb.DP(decoder.decodeInt(typeDescriptor, -1));
gb.DP((Integer)value);
} }
} }
} }
@ -146,9 +147,8 @@ public class BCF2GenotypeFieldDecoders {
@Override @Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) { public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
for ( final GenotypeBuilder gb : gbs ) { for ( final GenotypeBuilder gb : gbs ) {
final Object value = decoder.decodeTypedValue(typeDescriptor); // the -1 is for missing
if ( value != null ) gb.GQ(decoder.decodeInt(typeDescriptor, -1));
gb.GQ((Integer)value);
} }
} }
} }
@ -157,9 +157,7 @@ public class BCF2GenotypeFieldDecoders {
@Override @Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) { public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
for ( final GenotypeBuilder gb : gbs ) { for ( final GenotypeBuilder gb : gbs ) {
final int[] AD = decodeIntArray(decoder.decodeTypedValue(typeDescriptor)); gb.AD(decoder.decodeIntArray(typeDescriptor));
if ( AD != null )
gb.AD(AD);
} }
} }
} }
@ -168,9 +166,7 @@ public class BCF2GenotypeFieldDecoders {
@Override @Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) { public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
for ( final GenotypeBuilder gb : gbs ) { for ( final GenotypeBuilder gb : gbs ) {
final int[] PL = decodeIntArray(decoder.decodeTypedValue(typeDescriptor)); gb.PL(decoder.decodeIntArray(typeDescriptor));
if ( PL != null )
gb.PL(PL);
} }
} }
} }
@ -188,8 +184,13 @@ public class BCF2GenotypeFieldDecoders {
for ( final GenotypeBuilder gb : gbs ) { for ( final GenotypeBuilder gb : gbs ) {
Object value = decoder.decodeTypedValue(typeDescriptor); Object value = decoder.decodeTypedValue(typeDescriptor);
if ( value != null ) { // don't add missing values if ( value != null ) { // don't add missing values
if ( value instanceof List && ((List)value).size() == 1) if ( value instanceof List && ((List)value).size() == 1) {
// todo -- I really hate this, and it suggests that the code isn't completely right
// the reason it's here is that it's possible to prune down a vector to a singleton
// value and there we have the contract that the value comes back as an atomic value
// not a vector of size 1
value = ((List)value).get(0); value = ((List)value).get(0);
}
gb.attribute(field, value); gb.attribute(field, value);
} }
} }

View File

@ -56,7 +56,8 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
@Override @Override
public LazyGenotypesContext.LazyData parse(final Object data) { public LazyGenotypesContext.LazyData parse(final Object data) {
logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each"); if ( logger.isDebugEnabled() )
logger.debug("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");
// load our byte[] data into the decoder // load our byte[] data into the decoder
final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes); final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2; package org.broadinstitute.sting.utils.codecs.bcf2;
import java.util.EnumSet;
/** /**
* BCF2 types and information * BCF2 types and information
* *
@ -67,4 +69,6 @@ public enum BCF2Type {
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
public Object getMissingJavaValue() { return missingJavaValue; } public Object getMissingJavaValue() { return missingJavaValue; }
public int getMissingBytes() { return missingBytes; } public int getMissingBytes() { return missingBytes; }
public final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);
} }