diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 565de4e25..5456d47ed 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -191,12 +191,12 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende // -------------------------------------------------------------------------------- private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) { - final int contigOffset = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); + final int contigOffset = decoder.decodeInt(BCF2Type.INT32); final String contig = lookupContigName(contigOffset); builder.chr(contig); - final int pos = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); - final int refLength = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); + final int pos = decoder.decodeInt(BCF2Type.INT32); + final int refLength = decoder.decodeInt(BCF2Type.INT32); builder.start((long)pos); builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals @@ -205,8 +205,8 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende builder.log10PError(((Double)qual) / -10.0); } - final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); - final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); + final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32); + final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32); final int nAlleles = nAlleleInfo >> 16; final int nInfo = nAlleleInfo & 0x00FF; final int nFormatFields = nFormatSamples >> 24; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index 1ae84483d..695e8656e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.codecs.bcf2; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broad.tribble.FeatureCodec; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -33,6 +35,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; +import java.util.Arrays; public class BCF2Decoder { final protected static Logger logger = Logger.getLogger(FeatureCodec.class); @@ -131,7 +134,7 @@ public class BCF2Decoder { } public final Object decodeTypedValue(final byte typeDescriptor) { - final int size = BCF2Utils.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : BCF2Utils.decodeSize(typeDescriptor); + final int size = decodeNumberOfElements(typeDescriptor); final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); assert size >= 0; @@ -155,7 +158,7 @@ public class BCF2Decoder { public final Object decodeSingleValue(final BCF2Type type) { // TODO -- decodeTypedValue should integrate this routine - final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream); + final int value = decodeInt(type); if ( value == type.getMissingBytes() ) return null; @@ -191,22 +194,94 @@ public class BCF2Decoder { } } - private final int decodeVectorSize() { - final byte typeDescriptor = readTypeDescriptor(); - final int size = BCF2Utils.decodeSize(typeDescriptor); + @Ensures("result >= 0") + public final int decodeNumberOfElements(final byte typeDescriptor) { + if ( BCF2Utils.sizeIsOverflow(typeDescriptor) ) + // -1 ensures we explode immediately with a bad size if the result is missing + return decodeInt(readTypeDescriptor(), -1); + else + // the size is inline, so just decode it + return BCF2Utils.decodeSize(typeDescriptor); + } + + /** + * Decode an int from the stream. If the value in the stream is missing, + * returns missingValue. Requires the typeDescriptor indicate an inline + * single element event + * + * @param typeDescriptor + * @return + */ + @Requires("BCF2Utils.decodeSize(typeDescriptor) == 1") + public final int decodeInt(final byte typeDescriptor, final int missingValue) { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - - assert size == 1; - assert type == BCF2Type.INT8 || type == BCF2Type.INT16 || type == BCF2Type.INT32; - - return decodeInt(type.getSizeInBytes()); + final int i = decodeInt(type); + return i == type.getMissingBytes() ? missingValue : i; } - public final int decodeInt(int bytesForEachInt) { - return BCF2Utils.readInt(bytesForEachInt, recordStream); + @Requires("type != null") + public final int decodeInt(final BCF2Type type) { + return BCF2Utils.readInt(type.getSizeInBytes(), recordStream); } - public final double rawFloatToFloat(final int rawFloat) { + /** + * Low-level reader for int[] + * + * Requires a typeDescriptor so the function knows how many elements to read, + * and how they are encoded. + * + * If size == 0 => result is null + * If size > 0 => result depends on the actual values in the stream + * -- If the first element read is MISSING, result is null (all values are missing) + * -- Else result = int[N] where N is the first N non-missing values decoded + * + * @param maybeDest if not null we'll not allocate space for the vector, but instead use + * the externally allocated array of ints to store values. If the + * size of this vector is < the actual size of the elements, we'll be + * forced to use freshly allocated arrays. Also note that padded + * int elements are still forced to do a fresh allocation as well. + * @return see description + */ + @Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0"}) + public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) { + if ( size == 0 ) { + return null; + } else { + if ( maybeDest != null && maybeDest.length < size ) + maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small + + final int val1 = decodeInt(type); + if ( val1 == type.getMissingBytes() ) { + // fast path for first element being missing + for ( int i = 1; i < size; i++ ) decodeInt(type); + return null; + } else { + // we know we will have at least 1 element, so making the int[] is worth it + final int[] ints = maybeDest == null ? new int[size] : maybeDest; + ints[0] = val1; // we already read the first one + for ( int i = 1; i < size; i++ ) { + ints[i] = decodeInt(type); + if ( ints[i] == type.getMissingBytes() ) { + // read the rest of the missing values, dropping them + for ( int j = i + 1; j < size; j++ ) decodeInt(type); + // deal with auto-pruning by returning an int[] containing + // only the non-MISSING values. We do this by copying the first + // i elements, as i itself is missing + return Arrays.copyOf(ints, i); + } + } + return ints; // all of the elements were non-MISSING + } + } + } + + public final int[] decodeIntArray(final byte typeDescriptor) { + final int size = decodeNumberOfElements(typeDescriptor); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); + return decodeIntArray(size, type, null); + } + + public final double rawFloatToFloat(final int rawFloat) { return (double)Float.intBitsToFloat(rawFloat); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java index e0c89fd77..27d36cc57 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -102,27 +102,29 @@ public class BCF2GenotypeFieldDecoders { private class GTDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + // we have to do a bit of low-level processing here as we want to know the size upfronta + final int size = decoder.decodeNumberOfElements(typeDescriptor); + final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); + + // a single cache for the encoded genotypes, since we don't actually need this vector + final int[] tmp = new int[size]; + + // TODO -- fast path for size == 2 (diploid) and many samples + // TODO -- by creating all 4 allele combinations and doing a straight lookup instead of allocations them for ( final GenotypeBuilder gb : gbs ) { - // TODO -- fast path for size == 2 (diploid) - final List encoded = (List)decoder.decodeTypedValue(typeDescriptor); + final int[] encoded = decoder.decodeIntArray(size, type, tmp); if ( encoded == null ) // no called sample GT = . gb.alleles(null); else { - // we have at least some alleles to decode - final List gt = new ArrayList(encoded.size()); + assert encoded.length > 0; - for ( final Integer encode : encoded ) { - if ( encode == null ) { - // absent, as are all following by definition - break; - } else { - final int offset = encode >> 1; - if ( offset == 0 ) - gt.add(Allele.NO_CALL); - else - gt.add(siteAlleles.get(offset - 1)); - } + // we have at least some alleles to decode + final List gt = new ArrayList(encoded.length); + + for ( final int encode : encoded ) { + final int offset = encode >> 1; + gt.add(offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1)); } gb.alleles(gt); @@ -135,9 +137,8 @@ public class BCF2GenotypeFieldDecoders { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { for ( final GenotypeBuilder gb : gbs ) { - final Object value = decoder.decodeTypedValue(typeDescriptor); - if ( value != null ) - gb.DP((Integer)value); + // the -1 is for missing + gb.DP(decoder.decodeInt(typeDescriptor, -1)); } } } @@ -146,9 +147,8 @@ public class BCF2GenotypeFieldDecoders { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { for ( final GenotypeBuilder gb : gbs ) { - final Object value = decoder.decodeTypedValue(typeDescriptor); - if ( value != null ) - gb.GQ((Integer)value); + // the -1 is for missing + gb.GQ(decoder.decodeInt(typeDescriptor, -1)); } } } @@ -157,9 +157,7 @@ public class BCF2GenotypeFieldDecoders { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { for ( final GenotypeBuilder gb : gbs ) { - final int[] AD = decodeIntArray(decoder.decodeTypedValue(typeDescriptor)); - if ( AD != null ) - gb.AD(AD); + gb.AD(decoder.decodeIntArray(typeDescriptor)); } } } @@ -168,9 +166,7 @@ public class BCF2GenotypeFieldDecoders { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { for ( final GenotypeBuilder gb : gbs ) { - final int[] PL = decodeIntArray(decoder.decodeTypedValue(typeDescriptor)); - if ( PL != null ) - gb.PL(PL); + gb.PL(decoder.decodeIntArray(typeDescriptor)); } } } @@ -188,8 +184,13 @@ public class BCF2GenotypeFieldDecoders { for ( final GenotypeBuilder gb : gbs ) { Object value = decoder.decodeTypedValue(typeDescriptor); if ( value != null ) { // don't add missing values - if ( value instanceof List && ((List)value).size() == 1) + if ( value instanceof List && ((List)value).size() == 1) { + // todo -- I really hate this, and it suggests that the code isn't completely right + // the reason it's here is that it's possible to prune down a vector to a singleton + // value and there we have the contract that the value comes back as an atomic value + // not a vector of size 1 value = ((List)value).get(0); + } gb.attribute(field, value); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index bafda761d..9f5035086 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -56,7 +56,8 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { @Override public LazyGenotypesContext.LazyData parse(final Object data) { - logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each"); + if ( logger.isDebugEnabled() ) + logger.debug("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each"); // load our byte[] data into the decoder final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java index cc7debc00..9e21ddecd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.codecs.bcf2; +import java.util.EnumSet; + /** * BCF2 types and information * @@ -67,4 +69,6 @@ public enum BCF2Type { public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } public Object getMissingJavaValue() { return missingJavaValue; } public int getMissingBytes() { return missingBytes; } + + public final static EnumSet INTEGERS = EnumSet.of(INT8, INT16, INT32); }