diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index 695e8656e..b8d30addd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -37,7 +37,7 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; -public class BCF2Decoder { +public final class BCF2Decoder { final protected static Logger logger = Logger.getLogger(FeatureCodec.class); byte[] recordBytes; @@ -242,7 +242,7 @@ public class BCF2Decoder { * int elements are still forced to do a fresh allocation as well. * @return see description */ - @Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0"}) + @Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0", "type != null"}) public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) { if ( size == 0 ) { return null; @@ -281,7 +281,7 @@ public class BCF2Decoder { return decodeIntArray(size, type, null); } - public final double rawFloatToFloat(final int rawFloat) { + public final double rawFloatToFloat(final int rawFloat) { return (double)Float.intBitsToFloat(rawFloat); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java index d9cf1b1d0..2f8a99715 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java @@ -37,7 +37,7 @@ import java.util.*; * @author depristo * @since 5/12 */ -public class BCF2Encoder { +public final class BCF2Encoder { // TODO -- increase default size? public static final int WRITE_BUFFER_INITIAL_SIZE = 16384; private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java index fe59f0026..ccbf33f35 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -26,15 +26,14 @@ package org.broadinstitute.sting.utils.codecs.bcf2; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; +import java.util.*; /** * An efficient @@ -43,6 +42,10 @@ import java.util.List; * @since Date created */ public class BCF2GenotypeFieldDecoders { + final protected static Logger logger = Logger.getLogger(BCF2GenotypeFieldDecoders.class); + private final static boolean ENABLE_FASTPATH_GT = true; + private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number + // initialized once per writer to allow parallel writers to work private final HashMap genotypeFieldDecoder = new HashMap(); private final Decoder defaultDecoder = new GenericDecoder(); @@ -92,6 +95,8 @@ public class BCF2GenotypeFieldDecoders { * the PL field into a int[] rather than the generic List of Integer */ public interface Decoder { + @Requires({"siteAlleles != null", "! siteAlleles.isEmpty()", + "field != null", "decoder != null", "gbs != null", "! gbs.isEmpty()"}) public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, @@ -103,25 +108,80 @@ public class BCF2GenotypeFieldDecoders { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { // we have to do a bit of low-level processing here as we want to know the size upfronta - final int size = decoder.decodeNumberOfElements(typeDescriptor); + final int ploidy = decoder.decodeNumberOfElements(typeDescriptor); + + if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.size() >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) + fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs); + else { + generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs); + } + } + + /** + * fast path for many samples with diploid genotypes + * + * The way this would work is simple. Create a List diploidGenotypes[] object + * After decoding the offset, if that sample is diploid compute the + * offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1 + * if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype + * cache it and use that + * + * Some notes. If there are nAlleles at the site, there are implicitly actually + * n + 1 options including + */ + @Requires("siteAlleles.size() == 2") + @SuppressWarnings({"unchecked"}) + private final void fastBiallelicDiploidDecode(final List siteAlleles, + final BCF2Decoder decoder, + final byte typeDescriptor, + final List gbs) { + logger.info("fastBiallelicDiploidDecode"); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); + + final int nPossibleGenotypes = 3 * 3; + final Object allGenotypes[] = new Object[nPossibleGenotypes]; + + for ( final GenotypeBuilder gb : gbs ) { + final int a1 = decoder.decodeInt(type); + final int a2 = decoder.decodeInt(type); + + if ( a1 == type.getMissingBytes() ) { + assert a2 == type.getMissingBytes(); + // no called sample GT = . + gb.alleles(null); + } else if ( a2 == type.getMissingBytes() ) { + gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1))); + } else { + // downshift to remove phase + final int offset = (a1 >> 1) * 3 + (a2 >> 1); + assert offset < allGenotypes.length; + + // TODO -- how can I get rid of this cast? + List gt = (List)allGenotypes[offset]; + if ( gt == null ) { + final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1); + final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2); + gt = Arrays.asList(allele1, allele2); + allGenotypes[offset] = gt; + } + + gb.alleles(gt); + } + } + } + + private final void generalDecode(final List siteAlleles, + final int ploidy, + final BCF2Decoder decoder, + final byte typeDescriptor, + final List gbs) { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); // a single cache for the encoded genotypes, since we don't actually need this vector - final int[] tmp = new int[size]; - - // TODO -- fast path for many samples with diploid genotypes - // - // The way this would work is simple. Create a List diploidGenotypes[] object - // After decoding the offset, if that sample is diploid compute the - // offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1 - // if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype - // cache it and use that - // - // Some notes. If there are nAlleles at the site, there are implicitly actually - // n + 1 options including + final int[] tmp = new int[ploidy]; for ( final GenotypeBuilder gb : gbs ) { - final int[] encoded = decoder.decodeIntArray(size, type, tmp); + final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp); if ( encoded == null ) // no called sample GT = . gb.alleles(null); @@ -131,16 +191,22 @@ public class BCF2GenotypeFieldDecoders { // we have at least some alleles to decode final List gt = new ArrayList(encoded.length); - for ( final int encode : encoded ) { - // TODO -- handle padding! - final int offset = encode >> 1; - gt.add(offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1)); - } + // note that the auto-pruning of fields magically handles different + // ploidy per sample at a site + for ( final int encode : encoded ) + gt.add(getAlleleFromEncoded(siteAlleles, encode)); gb.alleles(gt); } } } + + @Requires({"siteAlleles != null && ! siteAlleles.isEmpty()", "encode >= 0"}) + @Ensures("result != null") + private final Allele getAlleleFromEncoded(final List siteAlleles, final int encode) { + final int offset = encode >> 1; + return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1); + } } private class DPDecoder implements Decoder { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index 8fb38460d..ac669667f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.codecs.bcf2; import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; @@ -44,7 +45,7 @@ import java.util.*; * @author depristo * @since 5/12 */ -public class BCF2Utils { +public final class BCF2Utils { public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes(); public static final int MAX_ALLELES_IN_GENOTYPES = 127; @@ -83,7 +84,8 @@ public class BCF2Utils { * @param header the VCFHeader from which to build the dictionary * @return a non-null dictionary of elements, may be empty */ - @Ensures("new HashSet(result).size() == result.size()") + @Requires("header != null") + @Ensures({"result != null", "new HashSet(result).size() == result.size()"}) public final static ArrayList makeDictionary(final VCFHeader header) { final Set dict = new TreeSet(); @@ -99,20 +101,24 @@ public class BCF2Utils { return new ArrayList(dict); } + @Requires({"nElements >= 0", "type != null"}) public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) { int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER); byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F)); return typeByte; } + @Ensures("result >= 0") public final static int decodeSize(final byte typeDescriptor) { return (0xF0 & typeDescriptor) >> 4; } + @Ensures("result >= 0") public final static int decodeTypeID(final byte typeDescriptor) { return typeDescriptor & 0x0F; } + @Ensures("result != null") public final static BCF2Type decodeType(final byte typeDescriptor) { return ID_TO_ENUM[decodeTypeID(typeDescriptor)]; } @@ -121,6 +127,7 @@ public class BCF2Utils { return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER; } + @Requires("nElements >= 0") public final static boolean willOverflow(final long nElements) { return nElements > MAX_INLINE_ELEMENTS; } @@ -132,6 +139,7 @@ public class BCF2Utils { } public final static byte readByte(final InputStream stream) { + // TODO -- shouldn't be capturing error here try { return (byte)(stream.read() & 0xFF); } catch ( IOException e ) { @@ -139,6 +147,7 @@ public class BCF2Utils { } } + @Requires({"stream != null", "bytesForEachInt > 0"}) public final static int readInt(int bytesForEachInt, final InputStream stream) { switch ( bytesForEachInt ) { case 1: { @@ -165,10 +174,10 @@ public class BCF2Utils { * @param strings size > 1 list of strings * @return */ + @Requires({"strings != null", "strings.size() > 1"}) + @Ensures("result != null") public static final String collapseStringList(final List strings) { - assert strings.size() > 1; - - StringBuilder b = new StringBuilder(); + final StringBuilder b = new StringBuilder(); for ( final String s : strings ) { assert s.indexOf(",") == -1; // no commas in individual strings b.append(",").append(s); @@ -185,12 +194,15 @@ public class BCF2Utils { * @param collapsed * @return */ + @Requires({"collapsed != null", "isCollapsedString(collapsed)"}) + @Ensures("result != null") public static final List exploreStringList(final String collapsed) { assert isCollapsedString(collapsed); final String[] exploded = collapsed.substring(1).split(","); return Arrays.asList(exploded); } + @Requires("s != null") public static final boolean isCollapsedString(final String s) { return s.charAt(0) == ','; } @@ -204,6 +216,8 @@ public class BCF2Utils { * @param vcfFile * @return */ + @Requires("vcfFile != null") + @Ensures("result != null") public static final File shadowBCF(final File vcfFile) { final String path = vcfFile.getAbsolutePath(); if ( path.contains(".vcf") )