Performance optimizations for Genotype field decoding for GT field

-- Fast path decoder for biallelic diploid GT fields that avoids allocating the same genotypes over and over
-- Contracts
-- final classes
This commit is contained in:
Mark DePristo 2012-06-06 14:10:05 -04:00
parent 7fbca7013e
commit 01ddf9555a
4 changed files with 111 additions and 31 deletions

View File

@ -37,7 +37,7 @@ import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
public class BCF2Decoder {
public final class BCF2Decoder {
final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
byte[] recordBytes;
@ -242,7 +242,7 @@ public class BCF2Decoder {
* int elements are still forced to do a fresh allocation as well.
* @return see description
*/
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0"})
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0", "type != null"})
public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) {
if ( size == 0 ) {
return null;
@ -281,7 +281,7 @@ public class BCF2Decoder {
return decodeIntArray(size, type, null);
}
public final double rawFloatToFloat(final int rawFloat) {
/**
 * Reinterpret a raw 32-bit pattern read from the BCF2 stream as an IEEE-754
 * float and widen it to double.
 *
 * @param rawFloat the float's bit pattern, as an int
 * @return the decoded value as a double
 */
public final double rawFloatToFloat(final int rawFloat) {
return (double)Float.intBitsToFloat(rawFloat);
}

View File

@ -37,7 +37,7 @@ import java.util.*;
* @author depristo
* @since 5/12
*/
public class BCF2Encoder {
public final class BCF2Encoder {
// TODO -- increase default size?
public static final int WRITE_BUFFER_INITIAL_SIZE = 16384;
private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE);

View File

@ -26,15 +26,14 @@ package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.*;
/**
* An efficient
@ -43,6 +42,10 @@ import java.util.List;
* @since Date created
*/
public class BCF2GenotypeFieldDecoders {
final protected static Logger logger = Logger.getLogger(BCF2GenotypeFieldDecoders.class);
private final static boolean ENABLE_FASTPATH_GT = true;
private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number
// initialized once per writer to allow parallel writers to work
private final HashMap<String, Decoder> genotypeFieldDecoder = new HashMap<String, Decoder>();
private final Decoder defaultDecoder = new GenericDecoder();
@ -92,6 +95,8 @@ public class BCF2GenotypeFieldDecoders {
* the PL field into a int[] rather than the generic List of Integer
*/
public interface Decoder {
@Requires({"siteAlleles != null", "! siteAlleles.isEmpty()",
"field != null", "decoder != null", "gbs != null", "! gbs.isEmpty()"})
public void decode(final List<Allele> siteAlleles,
final String field,
final BCF2Decoder decoder,
@ -103,25 +108,80 @@ public class BCF2GenotypeFieldDecoders {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
// we have to do a bit of low-level processing here as we want to know the size upfront
final int size = decoder.decodeNumberOfElements(typeDescriptor);
final int ploidy = decoder.decodeNumberOfElements(typeDescriptor);
if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.size() >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs);
else {
generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs);
}
}
/**
 * Fast path for decoding GT at biallelic sites with diploid samples.
 *
 * Decodes the two encoded allele values per sample, and caches each distinct
 * resulting List&lt;Allele&gt; in a small table indexed by
 * offset = alleleIndex1 * 3 + alleleIndex2, so every sample sharing a genotype
 * reuses the same allele list instead of allocating a fresh one per sample.
 *
 * With 2 alleles at the site each chromosome has 3 possible encoded states
 * (the implicit no-call plus the two site alleles), hence the 3 * 3 table;
 * fully and half missing calls are handled before the table lookup.
 *
 * @param siteAlleles the (exactly two) alleles segregating at this site
 * @param decoder low-level stream decoder positioned at this site's GT values
 * @param typeDescriptor the BCF2 type descriptor byte for the GT values
 * @param gbs one GenotypeBuilder per sample, updated in place via alleles()
 */
@Requires("siteAlleles.size() == 2")
@SuppressWarnings({"unchecked"})
private final void fastBiallelicDiploidDecode(final List<Allele> siteAlleles,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs) {
// NOTE(review): per-site info logging is expensive on this hot path; consider debug level
logger.info("fastBiallelicDiploidDecode");
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
// 3 states per chromosome => 3 * 3 distinct diploid combinations
final int nPossibleGenotypes = 3 * 3;
final Object allGenotypes[] = new Object[nPossibleGenotypes];
for ( final GenotypeBuilder gb : gbs ) {
final int a1 = decoder.decodeInt(type);
final int a2 = decoder.decodeInt(type);
if ( a1 == type.getMissingBytes() ) {
// a missing first allele implies the whole genotype is missing
assert a2 == type.getMissingBytes();
// no called sample GT = .
gb.alleles(null);
} else if ( a2 == type.getMissingBytes() ) {
// only the first allele present -- emit a single-allele genotype
gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1)));
} else {
// downshift to remove phase
final int offset = (a1 >> 1) * 3 + (a2 >> 1);
assert offset < allGenotypes.length;
// TODO -- how can I get rid of this cast?
List<Allele> gt = (List<Allele>)allGenotypes[offset];
if ( gt == null ) {
// first time this allele pair is seen: build it once and cache it
final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1);
final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2);
gt = Arrays.asList(allele1, allele2);
allGenotypes[offset] = gt;
}
gb.alleles(gt);
}
}
}
private final void generalDecode(final List<Allele> siteAlleles,
final int ploidy,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
// a single cache for the encoded genotypes, since we don't actually need this vector
final int[] tmp = new int[size];
// TODO -- fast path for many samples with diploid genotypes
//
// The way this would work is simple. Create a List<Allele> diploidGenotypes[] object
// After decoding the offset, if that sample is diploid compute the
// offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1
// if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype
// cache it and use that
//
// Some notes. If there are nAlleles at the site, there are implicitly actually
// n + 1 options including
final int[] tmp = new int[ploidy];
for ( final GenotypeBuilder gb : gbs ) {
final int[] encoded = decoder.decodeIntArray(size, type, tmp);
final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp);
if ( encoded == null )
// no called sample GT = .
gb.alleles(null);
@ -131,16 +191,22 @@ public class BCF2GenotypeFieldDecoders {
// we have at least some alleles to decode
final List<Allele> gt = new ArrayList<Allele>(encoded.length);
for ( final int encode : encoded ) {
// TODO -- handle padding!
final int offset = encode >> 1;
gt.add(offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1));
}
// note that the auto-pruning of fields magically handles different
// ploidy per sample at a site
for ( final int encode : encoded )
gt.add(getAlleleFromEncoded(siteAlleles, encode));
gb.alleles(gt);
}
}
}
/**
 * Map one BCF2-encoded GT value to its Allele.
 *
 * The low bit carries phasing, so the allele index is encode &gt;&gt; 1;
 * an index of 0 denotes no call, otherwise index - 1 selects from siteAlleles.
 *
 * @param siteAlleles the non-empty list of alleles at this site
 * @param encode the raw encoded GT value (&gt;= 0)
 * @return NO_CALL or the selected site allele; never null
 */
@Requires({"siteAlleles != null && ! siteAlleles.isEmpty()", "encode >= 0"})
@Ensures("result != null")
private final Allele getAlleleFromEncoded(final List<Allele> siteAlleles, final int encode) {
final int offset = encode >> 1;
return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1);
}
}
private class DPDecoder implements Decoder {

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
@ -44,7 +45,7 @@ import java.util.*;
* @author depristo
* @since 5/12
*/
public class BCF2Utils {
public final class BCF2Utils {
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
public static final int MAX_ALLELES_IN_GENOTYPES = 127;
@ -83,7 +84,8 @@ public class BCF2Utils {
* @param header the VCFHeader from which to build the dictionary
* @return a non-null dictionary of elements, may be empty
*/
@Ensures("new HashSet(result).size() == result.size()")
@Requires("header != null")
@Ensures({"result != null", "new HashSet(result).size() == result.size()"})
public final static ArrayList<String> makeDictionary(final VCFHeader header) {
final Set<String> dict = new TreeSet<String>();
@ -99,20 +101,24 @@ public class BCF2Utils {
return new ArrayList<String>(dict);
}
/**
 * Pack an element count and a BCF2 type into a single type-descriptor byte.
 *
 * The size occupies the high nibble -- counts of OVERFLOW_ELEMENT_MARKER or
 * more are stored as the marker itself -- and the type ID the low nibble.
 *
 * @param nElements number of elements to encode (&gt;= 0)
 * @param type the BCF2 type of the elements
 * @return the packed descriptor byte
 */
@Requires({"nElements >= 0", "type != null"})
public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER);
byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F));
return typeByte;
}
/**
 * Extract the element count from the high nibble of a type descriptor byte.
 * Inverse of the size half of encodeTypeDescriptor; result is in [0, 15].
 */
@Ensures("result >= 0")
public final static int decodeSize(final byte typeDescriptor) {
return (0xF0 & typeDescriptor) >> 4;
}
/**
 * Extract the BCF2 type ID from the low nibble of a type descriptor byte.
 * Inverse of the type half of encodeTypeDescriptor; result is in [0, 15].
 */
@Ensures("result >= 0")
public final static int decodeTypeID(final byte typeDescriptor) {
return typeDescriptor & 0x0F;
}
/**
 * Decode the BCF2Type enum constant for a type descriptor byte by looking up
 * its type ID (low nibble) in the ID_TO_ENUM table.
 */
@Ensures("result != null")
public final static BCF2Type decodeType(final byte typeDescriptor) {
return ID_TO_ENUM[decodeTypeID(typeDescriptor)];
}
@ -121,6 +127,7 @@ public class BCF2Utils {
return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
}
/**
 * Is nElements too large to store inline in a type descriptor byte?
 * True exactly when the count exceeds MAX_INLINE_ELEMENTS.
 */
@Requires("nElements >= 0")
public final static boolean willOverflow(final long nElements) {
return nElements > MAX_INLINE_ELEMENTS;
}
@ -132,6 +139,7 @@ public class BCF2Utils {
}
public final static byte readByte(final InputStream stream) {
// TODO -- shouldn't be capturing error here
try {
return (byte)(stream.read() & 0xFF);
} catch ( IOException e ) {
@ -139,6 +147,7 @@ public class BCF2Utils {
}
}
@Requires({"stream != null", "bytesForEachInt > 0"})
public final static int readInt(int bytesForEachInt, final InputStream stream) {
switch ( bytesForEachInt ) {
case 1: {
@ -165,10 +174,10 @@ public class BCF2Utils {
* @param strings size > 1 list of strings
* @return
*/
@Requires({"strings != null", "strings.size() > 1"})
@Ensures("result != null")
public static final String collapseStringList(final List<String> strings) {
assert strings.size() > 1;
StringBuilder b = new StringBuilder();
final StringBuilder b = new StringBuilder();
for ( final String s : strings ) {
assert s.indexOf(",") == -1; // no commas in individual strings
b.append(",").append(s);
@ -185,12 +194,15 @@ public class BCF2Utils {
* @param collapsed
* @return
*/
/**
 * Split a collapsed comma-joined string (",a,b,c") back into its component
 * strings ["a", "b", "c"].  Inverse of collapseStringList.
 *
 * @param collapsed a collapsed string, i.e. one starting with ','
 * @return a fixed-size list view of the exploded components
 */
@Requires({"collapsed != null", "isCollapsedString(collapsed)"})
@Ensures("result != null")
public static final List<String> exploreStringList(final String collapsed) {
assert isCollapsedString(collapsed);
final String[] exploded = collapsed.substring(1).split(",");
return Arrays.asList(exploded);
}
/**
 * Does s use the collapsed (leading comma) representation produced by
 * collapseStringList?
 *
 * NOTE(review): throws StringIndexOutOfBoundsException for the empty string --
 * presumably callers never pass one, but the @Requires only checks non-null;
 * confirm or add a length guard.
 */
@Requires("s != null")
public static final boolean isCollapsedString(final String s) {
return s.charAt(0) == ',';
}
@ -204,6 +216,8 @@ public class BCF2Utils {
* @param vcfFile
* @return
*/
@Requires("vcfFile != null")
@Ensures("result != null")
public static final File shadowBCF(final File vcfFile) {
final String path = vcfFile.getAbsolutePath();
if ( path.contains(".vcf") )