Performance optimizations for Genotype field decoding for GT field

-- Fast path decoder for biallelic diploid GT fields that avoids allocating the same genotypes over and over
-- Contracts
-- final classes
This commit is contained in:
Mark DePristo 2012-06-06 14:10:05 -04:00
parent 7fbca7013e
commit 01ddf9555a
4 changed files with 111 additions and 31 deletions

View File

@ -37,7 +37,7 @@ import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
public class BCF2Decoder {
public final class BCF2Decoder {
final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
byte[] recordBytes;
@ -242,7 +242,7 @@ public class BCF2Decoder {
* int elements are still forced to do a fresh allocation as well.
* @return see description
*/
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0"})
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0", "type != null"})
public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) {
if ( size == 0 ) {
return null;
@ -281,7 +281,7 @@ public class BCF2Decoder {
return decodeIntArray(size, type, null);
}
public final double rawFloatToFloat(final int rawFloat) {
/**
 * Reinterpret a raw 32-bit pattern read from the BCF2 stream as an IEEE-754
 * float and widen it to double.
 *
 * @param rawFloat the float's bit pattern, as an int
 * @return the decoded value as a double
 */
public final double rawFloatToFloat(final int rawFloat) {
return (double)Float.intBitsToFloat(rawFloat);
}

View File

@ -37,7 +37,7 @@ import java.util.*;
* @author depristo
* @since 5/12
*/
public class BCF2Encoder {
public final class BCF2Encoder {
// TODO -- increase default size?
public static final int WRITE_BUFFER_INITIAL_SIZE = 16384;
private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE);

View File

@ -26,15 +26,14 @@ package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.*;
/**
* An efficient
@ -43,6 +42,10 @@ import java.util.List;
* @since Date created
*/
public class BCF2GenotypeFieldDecoders {
final protected static Logger logger = Logger.getLogger(BCF2GenotypeFieldDecoders.class);
private final static boolean ENABLE_FASTPATH_GT = true;
private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number
// initialized once per writer to allow parallel writers to work
private final HashMap<String, Decoder> genotypeFieldDecoder = new HashMap<String, Decoder>();
private final Decoder defaultDecoder = new GenericDecoder();
@ -92,6 +95,8 @@ public class BCF2GenotypeFieldDecoders {
* the PL field into a int[] rather than the generic List of Integer
*/
public interface Decoder {
@Requires({"siteAlleles != null", "! siteAlleles.isEmpty()",
"field != null", "decoder != null", "gbs != null", "! gbs.isEmpty()"})
public void decode(final List<Allele> siteAlleles,
final String field,
final BCF2Decoder decoder,
@ -103,25 +108,80 @@ public class BCF2GenotypeFieldDecoders {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
// we have to do a bit of low-level processing here as we want to know the size upfront
final int size = decoder.decodeNumberOfElements(typeDescriptor);
final int ploidy = decoder.decodeNumberOfElements(typeDescriptor);
if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.size() >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs);
else {
generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs);
}
}
/**
 * Fast path for decoding GT at biallelic sites with diploid samples.
 *
 * Decodes the two encoded allele values per sample, and caches each distinct
 * resulting List&lt;Allele&gt; in a small table indexed by
 * offset = alleleIndex1 * 3 + alleleIndex2, so every sample sharing a genotype
 * reuses the same allele list instead of allocating a fresh one per sample.
 *
 * With 2 alleles at the site each chromosome has 3 possible encoded states
 * (the implicit no-call plus the two site alleles), hence the 3 * 3 table;
 * fully and half missing calls are handled before the table lookup.
 *
 * @param siteAlleles the (exactly two) alleles segregating at this site
 * @param decoder low-level stream decoder positioned at this site's GT values
 * @param typeDescriptor the BCF2 type descriptor byte for the GT values
 * @param gbs one GenotypeBuilder per sample, updated in place via alleles()
 */
@Requires("siteAlleles.size() == 2")
@SuppressWarnings({"unchecked"})
private final void fastBiallelicDiploidDecode(final List<Allele> siteAlleles,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs) {
// NOTE(review): per-site info logging is expensive on this hot path; consider debug level
logger.info("fastBiallelicDiploidDecode");
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
// 3 states per chromosome => 3 * 3 distinct diploid combinations
final int nPossibleGenotypes = 3 * 3;
final Object allGenotypes[] = new Object[nPossibleGenotypes];
for ( final GenotypeBuilder gb : gbs ) {
final int a1 = decoder.decodeInt(type);
final int a2 = decoder.decodeInt(type);
if ( a1 == type.getMissingBytes() ) {
// a missing first allele implies the whole genotype is missing
assert a2 == type.getMissingBytes();
// no called sample GT = .
gb.alleles(null);
} else if ( a2 == type.getMissingBytes() ) {
// only the first allele present -- emit a single-allele genotype
gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1)));
} else {
// downshift to remove phase
final int offset = (a1 >> 1) * 3 + (a2 >> 1);
assert offset < allGenotypes.length;
// TODO -- how can I get rid of this cast?
List<Allele> gt = (List<Allele>)allGenotypes[offset];
if ( gt == null ) {
// first time this allele pair is seen: build it once and cache it
final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1);
final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2);
gt = Arrays.asList(allele1, allele2);
allGenotypes[offset] = gt;
}
gb.alleles(gt);
}
}
}
private final void generalDecode(final List<Allele> siteAlleles,
final int ploidy,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
// a single cache for the encoded genotypes, since we don't actually need this vector
final int[] tmp = new int[size];
// TODO -- fast path for many samples with diploid genotypes
//
// The way this would work is simple. Create a List<Allele> diploidGenotypes[] object
// After decoding the offset, if that sample is diploid compute the
// offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1
// if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype
// cache it and use that
//
// Some notes. If there are nAlleles at the site, there are implicitly actually
// n + 1 options including
final int[] tmp = new int[ploidy];
for ( final GenotypeBuilder gb : gbs ) {
final int[] encoded = decoder.decodeIntArray(size, type, tmp);
final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp);
if ( encoded == null )
// no called sample GT = .
gb.alleles(null);
@ -131,16 +191,22 @@ public class BCF2GenotypeFieldDecoders {
// we have at least some alleles to decode
final List<Allele> gt = new ArrayList<Allele>(encoded.length);
for ( final int encode : encoded ) {
// TODO -- handle padding!
final int offset = encode >> 1;
gt.add(offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1));
}
// note that the auto-pruning of fields magically handles different
// ploidy per sample at a site
for ( final int encode : encoded )
gt.add(getAlleleFromEncoded(siteAlleles, encode));
gb.alleles(gt);
}
}
}
/**
 * Map one BCF2-encoded GT value to its Allele.
 *
 * The low bit carries phasing, so the allele index is encode &gt;&gt; 1;
 * an index of 0 denotes no call, otherwise index - 1 selects from siteAlleles.
 *
 * @param siteAlleles the non-empty list of alleles at this site
 * @param encode the raw encoded GT value (&gt;= 0)
 * @return NO_CALL or the selected site allele; never null
 */
@Requires({"siteAlleles != null && ! siteAlleles.isEmpty()", "encode >= 0"})
@Ensures("result != null")
private final Allele getAlleleFromEncoded(final List<Allele> siteAlleles, final int encode) {
final int offset = encode >> 1;
return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1);
}
}
private class DPDecoder implements Decoder {

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
@ -44,7 +45,7 @@ import java.util.*;
* @author depristo
* @since 5/12
*/
public class BCF2Utils {
public final class BCF2Utils {
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
public static final int MAX_ALLELES_IN_GENOTYPES = 127;
@ -83,7 +84,8 @@ public class BCF2Utils {
* @param header the VCFHeader from which to build the dictionary
* @return a non-null dictionary of elements, may be empty
*/
@Ensures("new HashSet(result).size() == result.size()")
@Requires("header != null")
@Ensures({"result != null", "new HashSet(result).size() == result.size()"})
public final static ArrayList<String> makeDictionary(final VCFHeader header) {
final Set<String> dict = new TreeSet<String>();
@ -99,20 +101,24 @@ public class BCF2Utils {
return new ArrayList<String>(dict);
}
/**
 * Pack an element count and a BCF2 type into a single type-descriptor byte.
 *
 * The size occupies the high nibble -- counts of OVERFLOW_ELEMENT_MARKER or
 * more are stored as the marker itself -- and the type ID the low nibble.
 *
 * @param nElements number of elements to encode (&gt;= 0)
 * @param type the BCF2 type of the elements
 * @return the packed descriptor byte
 */
@Requires({"nElements >= 0", "type != null"})
public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER);
byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F));
return typeByte;
}
/**
 * Extract the element count from the high nibble of a type descriptor byte.
 * Inverse of the size half of encodeTypeDescriptor; result is in [0, 15].
 */
@Ensures("result >= 0")
public final static int decodeSize(final byte typeDescriptor) {
return (0xF0 & typeDescriptor) >> 4;
}
/**
 * Extract the BCF2 type ID from the low nibble of a type descriptor byte.
 * Inverse of the type half of encodeTypeDescriptor; result is in [0, 15].
 */
@Ensures("result >= 0")
public final static int decodeTypeID(final byte typeDescriptor) {
return typeDescriptor & 0x0F;
}
/**
 * Decode the BCF2Type enum constant for a type descriptor byte by looking up
 * its type ID (low nibble) in the ID_TO_ENUM table.
 */
@Ensures("result != null")
public final static BCF2Type decodeType(final byte typeDescriptor) {
return ID_TO_ENUM[decodeTypeID(typeDescriptor)];
}
@ -121,6 +127,7 @@ public class BCF2Utils {
return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
}
/**
 * Is nElements too large to store inline in a type descriptor byte?
 * True exactly when the count exceeds MAX_INLINE_ELEMENTS.
 */
@Requires("nElements >= 0")
public final static boolean willOverflow(final long nElements) {
return nElements > MAX_INLINE_ELEMENTS;
}
@ -132,6 +139,7 @@ public class BCF2Utils {
}
public final static byte readByte(final InputStream stream) {
// TODO -- shouldn't be capturing error here
try {
return (byte)(stream.read() & 0xFF);
} catch ( IOException e ) {
@ -139,6 +147,7 @@ public class BCF2Utils {
}
}
@Requires({"stream != null", "bytesForEachInt > 0"})
public final static int readInt(int bytesForEachInt, final InputStream stream) {
switch ( bytesForEachInt ) {
case 1: {
@ -165,10 +174,10 @@ public class BCF2Utils {
* @param strings size > 1 list of strings
* @return
*/
@Requires({"strings != null", "strings.size() > 1"})
@Ensures("result != null")
public static final String collapseStringList(final List<String> strings) {
assert strings.size() > 1;
StringBuilder b = new StringBuilder();
final StringBuilder b = new StringBuilder();
for ( final String s : strings ) {
assert s.indexOf(",") == -1; // no commas in individual strings
b.append(",").append(s);
@ -185,12 +194,15 @@ public class BCF2Utils {
* @param collapsed
* @return
*/
/**
 * Split a collapsed comma-joined string (",a,b,c") back into its component
 * strings ["a", "b", "c"].  Inverse of collapseStringList.
 *
 * @param collapsed a collapsed string, i.e. one starting with ','
 * @return a fixed-size list view of the exploded components
 */
@Requires({"collapsed != null", "isCollapsedString(collapsed)"})
@Ensures("result != null")
public static final List<String> exploreStringList(final String collapsed) {
assert isCollapsedString(collapsed);
final String[] exploded = collapsed.substring(1).split(",");
return Arrays.asList(exploded);
}
/**
 * Does s use the collapsed (leading comma) representation produced by
 * collapseStringList?
 *
 * NOTE(review): throws StringIndexOutOfBoundsException for the empty string --
 * presumably callers never pass one, but the @Requires only checks non-null;
 * confirm or add a length guard.
 */
@Requires("s != null")
public static final boolean isCollapsedString(final String s) {
return s.charAt(0) == ',';
}
@ -204,6 +216,8 @@ public class BCF2Utils {
* @param vcfFile
* @return
*/
@Requires("vcfFile != null")
@Ensures("result != null")
public static final File shadowBCF(final File vcfFile) {
final String path = vcfFile.getAbsolutePath();
if ( path.contains(".vcf") )