Efficient reading of genotype fields v1

-- decodeIntArray in BCF2 decoder allows us to more efficiently read ints and int[] from stream directly into Genotype object -- Code cleanup / contracts added were appropriate -- V2 will have a yet more optimized path...
2012-06-04 12:17:15 -04:00 · 2012-06-04 12:17:15 -04:00 · 1d4eb46606
parent 3a70f8216b
commit 1d4eb46606
5 changed files with 128 additions and 47 deletions
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java
@ -191,12 +191,12 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
    // --------------------------------------------------------------------------------

    private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) {
-        final int contigOffset = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
+        final int contigOffset = decoder.decodeInt(BCF2Type.INT32);
        final String contig = lookupContigName(contigOffset);
        builder.chr(contig);

-        final int pos = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
-        final int refLength = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
+        final int pos = decoder.decodeInt(BCF2Type.INT32);
+        final int refLength = decoder.decodeInt(BCF2Type.INT32);
        builder.start((long)pos);
        builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals

@ -205,8 +205,8 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
            builder.log10PError(((Double)qual) / -10.0);
        }

-        final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
-        final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
+        final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32);
+        final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32);
        final int nAlleles = nAlleleInfo >> 16;
        final int nInfo = nAlleleInfo & 0x00FF;
        final int nFormatFields = nFormatSamples >>  24;
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java
@ -24,6 +24,8 @@

 package org.broadinstitute.sting.utils.codecs.bcf2;

+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
 import org.apache.log4j.Logger;
 import org.broad.tribble.FeatureCodec;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -33,6 +35,7 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.Arrays;

 public class BCF2Decoder {
    final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
@ -131,7 +134,7 @@ public class BCF2Decoder {
    }

    public final Object decodeTypedValue(final byte typeDescriptor) {
-        final int size = BCF2Utils.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : BCF2Utils.decodeSize(typeDescriptor);
+        final int size = decodeNumberOfElements(typeDescriptor);
        final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);

        assert size >= 0;
@ -155,7 +158,7 @@ public class BCF2Decoder {

    public final Object decodeSingleValue(final BCF2Type type) {
        // TODO -- decodeTypedValue should integrate this routine
-        final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
+        final int value = decodeInt(type);

        if ( value == type.getMissingBytes() )
            return null;
@ -191,22 +194,94 @@ public class BCF2Decoder {
        }
    }

-    private final int decodeVectorSize() {
-        final byte typeDescriptor = readTypeDescriptor();
-        final int size = BCF2Utils.decodeSize(typeDescriptor);
+    @Ensures("result >= 0")
+    public final int decodeNumberOfElements(final byte typeDescriptor) {
+        if ( BCF2Utils.sizeIsOverflow(typeDescriptor) )
+            // -1 ensures we explode immediately with a bad size if the result is missing
+            return decodeInt(readTypeDescriptor(), -1);
+        else
+            // the size is inline, so just decode it
+            return BCF2Utils.decodeSize(typeDescriptor);
+    }
+
+    /**
+     * Decode an int from the stream.  If the value in the stream is missing,
+     * returns missingValue.  Requires the typeDescriptor indicate an inline
+     * single element event
+     *
+     * @param typeDescriptor
+     * @return
+     */
+    @Requires("BCF2Utils.decodeSize(typeDescriptor) == 1")
+    public final int decodeInt(final byte typeDescriptor, final int missingValue) {
        final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
-
-        assert size == 1;
-        assert type == BCF2Type.INT8 || type == BCF2Type.INT16 || type == BCF2Type.INT32;
-
-        return decodeInt(type.getSizeInBytes());
+        final int i = decodeInt(type);
+        return i == type.getMissingBytes() ? missingValue : i;
    }

-    public final int decodeInt(int bytesForEachInt) {
-        return BCF2Utils.readInt(bytesForEachInt, recordStream);
+    @Requires("type != null")
+    public final int decodeInt(final BCF2Type type) {
+        return BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
    }

-    public final double rawFloatToFloat(final int rawFloat) {
+    /**
+     * Low-level reader for int[]
+     *
+     * Requires a typeDescriptor so the function knows how many elements to read,
+     * and how they are encoded.
+     *
+     * If size == 0 => result is null
+     * If size > 0 => result depends on the actual values in the stream
+     *      -- If the first element read is MISSING, result is null (all values are missing)
+     *      -- Else result = int[N] where N is the first N non-missing values decoded
+     *
+     * @param maybeDest if not null we'll not allocate space for the vector, but instead use
+     *                  the externally allocated array of ints to store values.  If the
+     *                  size of this vector is < the actual size of the elements, we'll be
+     *                  forced to use freshly allocated arrays.  Also note that padded
+     *                  int elements are still forced to do a fresh allocation as well.
+     * @return see description
+     */
+    @Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0"})
+    public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) {
+        if ( size == 0 ) {
+            return null;
+        } else {
+            if ( maybeDest != null && maybeDest.length < size )
+                maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small
+
+            final int val1 = decodeInt(type);
+            if ( val1 == type.getMissingBytes() ) {
+                // fast path for first element being missing
+                for ( int i = 1; i < size; i++ ) decodeInt(type);
+                return null;
+            } else {
+                // we know we will have at least 1 element, so making the int[] is worth it
+                final int[] ints = maybeDest == null ? new int[size] : maybeDest;
+                ints[0] = val1; // we already read the first one
+                for ( int i = 1; i < size; i++ ) {
+                    ints[i] = decodeInt(type);
+                    if ( ints[i] == type.getMissingBytes() ) {
+                        // read the rest of the missing values, dropping them
+                        for ( int j = i + 1; j < size; j++ ) decodeInt(type);
+                        // deal with auto-pruning by returning an int[] containing
+                        // only the non-MISSING values.  We do this by copying the first
+                        // i elements, as i itself is missing
+                        return Arrays.copyOf(ints, i);
+                    }
+                }
+                return ints; // all of the elements were non-MISSING
+            }
+        }
+    }
+
+    public final int[] decodeIntArray(final byte typeDescriptor) {
+        final int size = decodeNumberOfElements(typeDescriptor);
+        final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
+        return decodeIntArray(size, type, null);
+    }
+
+        public final double rawFloatToFloat(final int rawFloat) {
        return (double)Float.intBitsToFloat(rawFloat);
    }

--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java
@ -102,27 +102,29 @@ public class BCF2GenotypeFieldDecoders {
    private class GTDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
+            // we have to do a bit of low-level processing here as we want to know the size upfronta
+            final int size = decoder.decodeNumberOfElements(typeDescriptor);
+            final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
+
+            // a single cache for the encoded genotypes, since we don't actually need this vector
+            final int[] tmp = new int[size];
+
+            // TODO -- fast path for size == 2 (diploid) and many samples
+            // TODO -- by creating all 4 allele combinations and doing a straight lookup instead of allocations them
            for ( final GenotypeBuilder gb : gbs ) {
-                // TODO -- fast path for size == 2 (diploid)
-                final List<Integer> encoded = (List<Integer>)decoder.decodeTypedValue(typeDescriptor);
+                final int[] encoded = decoder.decodeIntArray(size, type, tmp);
                if ( encoded == null )
                    // no called sample GT = .
                    gb.alleles(null);
                else {
-                    // we have at least some alleles to decode
-                    final List<Allele> gt = new ArrayList<Allele>(encoded.size());
+                    assert encoded.length > 0;

-                    for ( final Integer encode : encoded ) {
-                        if ( encode == null ) {
-                            // absent, as are all following by definition
-                            break;
-                        } else {
-                            final int offset = encode >> 1;
-                            if ( offset == 0 )
-                                gt.add(Allele.NO_CALL);
-                            else
-                                gt.add(siteAlleles.get(offset - 1));
-                        }
+                    // we have at least some alleles to decode
+                    final List<Allele> gt = new ArrayList<Allele>(encoded.length);
+
+                    for ( final int encode : encoded ) {
+                        final int offset = encode >> 1;
+                        gt.add(offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1));
                    }

                    gb.alleles(gt);
@ -135,9 +137,8 @@ public class BCF2GenotypeFieldDecoders {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
-                final Object value = decoder.decodeTypedValue(typeDescriptor);
-                if ( value != null )
-                    gb.DP((Integer)value);
+                // the -1 is for missing
+                gb.DP(decoder.decodeInt(typeDescriptor, -1));
            }
        }
    }
@ -146,9 +147,8 @@ public class BCF2GenotypeFieldDecoders {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
-                final Object value = decoder.decodeTypedValue(typeDescriptor);
-                if ( value != null )
-                    gb.GQ((Integer)value);
+                // the -1 is for missing
+                gb.GQ(decoder.decodeInt(typeDescriptor, -1));
            }
        }
    }
@ -157,9 +157,7 @@ public class BCF2GenotypeFieldDecoders {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
-                final int[] AD = decodeIntArray(decoder.decodeTypedValue(typeDescriptor));
-                if ( AD != null )
-                    gb.AD(AD);
+                gb.AD(decoder.decodeIntArray(typeDescriptor));
            }
        }
    }
@ -168,9 +166,7 @@ public class BCF2GenotypeFieldDecoders {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
-                final int[] PL = decodeIntArray(decoder.decodeTypedValue(typeDescriptor));
-                if ( PL != null )
-                    gb.PL(PL);
+                gb.PL(decoder.decodeIntArray(typeDescriptor));
            }
        }
    }
@ -188,8 +184,13 @@ public class BCF2GenotypeFieldDecoders {
            for ( final GenotypeBuilder gb : gbs ) {
                Object value = decoder.decodeTypedValue(typeDescriptor);
                if ( value != null ) { // don't add missing values
-                    if ( value instanceof List && ((List)value).size() == 1)
+                    if ( value instanceof List && ((List)value).size() == 1) {
+                        // todo -- I really hate this, and it suggests that the code isn't completely right
+                        // the reason it's here is that it's possible to prune down a vector to a singleton
+                        // value and there we have the contract that the value comes back as an atomic value
+                        // not a vector of size 1
                        value = ((List)value).get(0);
+                    }
                    gb.attribute(field, value);
                }
            }
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java
@ -56,7 +56,8 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {

    @Override
    public LazyGenotypesContext.LazyData parse(final Object data) {
-        logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");
+        if ( logger.isDebugEnabled() )
+            logger.debug("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");

        // load our byte[] data into the decoder
        final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java
@ -24,6 +24,8 @@

 package org.broadinstitute.sting.utils.codecs.bcf2;

+import java.util.EnumSet;
+
 /**
 * BCF2 types and information
 *
@ -67,4 +69,6 @@ public enum BCF2Type {
    public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
    public Object getMissingJavaValue() { return missingJavaValue; }
    public int getMissingBytes() { return missingBytes; }
+
+    public final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);
 }