Simple optimizations for BCF2Encoder

-- Inline encodeTypedString that doesn't go via a List<Byte> intermediate
-- encodeTypedString overload that uses byte[] directly so that we can go from Allele.getBases() => BCF2
-- Fast paths for Atomic Float and Atomic Integer values avoiding intermediate list creation
-- Final UG integration test update
This commit is contained in:
Mark DePristo 2012-06-14 16:40:00 -04:00
parent 68eed7b313
commit 0384ce5d34
6 changed files with 60 additions and 11 deletions

View File

@ -100,13 +100,17 @@ public final class BCF2Encoder {
/**
 * Encodes a Java string as a typed CHAR value by delegating to the
 * {@code byte[]} overload of {@code encodeTypedString}.
 *
 * A null string is forwarded as a null byte array rather than dereferenced,
 * so it reaches the null branch of the {@code byte[]} overload instead of
 * failing here with a NullPointerException.
 *
 * @param s the string to encode; may be null
 * @throws IOException propagated from the {@code byte[]} overload
 */
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTypedString(final String s) throws IOException {
    // getBytes() converts with the platform default charset, exactly as before
    final byte[] bytes = s == null ? null : s.getBytes();
    encodeTypedString(bytes);
}
/**
 * Encodes a byte array as a typed CHAR value: calls encodeType with the
 * array length and BCF2Type.CHAR, then encodeRawChar once per byte.
 * A null array results in a single encodeType(0, BCF2Type.CHAR) call.
 *
 * @param s the bytes to encode; may be null
 * @throws IOException declared by this method; presumably raised by the underlying writes
 */
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTypedString(final byte[] s) throws IOException {
if ( s == null )
encodeType(0, BCF2Type.CHAR);
else {
// NOTE(review): the next four lines use String accessors (length(), charAt()) on what is
// now a byte[] parameter; they look like the pre-change side of this diff hunk -- confirm
encodeType(s.length(), BCF2Type.CHAR);
for ( int i = 0; i < s.length(); i++ ) {
final byte c = (byte)s.charAt(i);
encodeRawChar(c);
// byte[] form: array length and direct element access, no per-char cast
encodeType(s.length, BCF2Type.CHAR);
for ( int i = 0; i < s.length; i++ ) {
encodeRawChar(s[i]);
}
}
}

View File

@ -389,17 +389,30 @@ public abstract class BCF2FieldEncoder {
// ----------------------------------------------------------------------
public static class Float extends BCF2FieldEncoder {
final boolean isAtomic;
public Float(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
super(headerLine, dict, BCF2Type.FLOAT);
isAtomic = hasConstantNumElements() && numElements() == 1;
}
/**
 * Writes the float value(s) of this field with encoder.encodeRawFloat, then
 * pads with encoder.encodeRawMissingValue(type) until minValues entries
 * have been written.
 *
 * @param encoder   the encoder receiving the raw values
 * @param value     the field value; a single Double on the atomic path (may be null),
 *                  otherwise whatever toList(Double.class, value) accepts
 * @param type      the type passed to encodeRawMissingValue when padding
 * @param minValues the minimum number of entries to write
 * @throws IOException declared by this method; presumably raised by the encoder writes
 */
@Override
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
// NOTE(review): this toList call and the for loop just below it are repeated in the
// generic branch further down; they look like the pre-change side of this diff hunk -- confirm
final List<Double> doubles = toList(Double.class, value);
int count = 0;
for ( final double d : doubles ) {
encoder.encodeRawFloat(d);
count++;
// TODO -- can be restructured to avoid toList operation
if ( isAtomic ) {
// fast path for fields with 1 fixed float value
// a null value writes nothing here and is covered by the padding loop at the end
if ( value != null ) {
encoder.encodeRawFloat((Double)value);
count++;
}
} else {
// handle generic case
final List<Double> doubles = toList(Double.class, value);
for ( final double d : doubles ) {
encoder.encodeRawFloat(d);
count++;
}
}
// pad with missing values up to minValues entries
for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
}
@ -445,6 +458,30 @@ public abstract class BCF2FieldEncoder {
//
// ----------------------------------------------------------------------
/**
 * Integer encoder specialized for atomic (non-list) fields: the value is
 * handled as a single Integer instead of being converted to a list first.
 */
public static class AtomicInt extends BCF2FieldEncoder {
    /**
     * @param headerLine the header line describing the field
     * @param dict       the dictionary passed through to the superclass
     */
    public AtomicInt(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
        // null third argument: presumably no static type, since getDynamicType picks it per value -- confirm
        super(headerLine, dict, null);
    }

    /**
     * Chooses the integer type for a value: INT8 when the value is null,
     * otherwise whatever BCF2Utils.determineIntegerType returns for it.
     */
    @Override
    public BCF2Type getDynamicType(final Object value) {
        if ( value == null ) {
            return BCF2Type.INT8;
        }
        return BCF2Utils.determineIntegerType((Integer)value);
    }

    /**
     * Writes the value (if non-null) as one raw int of the given type, then
     * pads with missing values until minValues entries have been written.
     */
    @Override
    public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
        final boolean hasValue = value != null;
        if ( hasValue ) {
            encoder.encodeRawInt((Integer)value, type);
        }

        int written = hasValue ? 1 : 0;
        while ( written < minValues ) {
            encoder.encodeRawMissingValue(type);
            written++;
        }
    }
}
public static class GenericInts extends BCF2FieldEncoder {
public GenericInts(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
super(headerLine, dict, null);

View File

@ -201,6 +201,9 @@ public abstract class BCF2FieldWriter {
@Override
public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
// TODO
// TODO this piece of code consumes like 10% of the runtime alone because of the vc.getGenotypes() iteration
// TODO
encodingType = BCF2Type.INT8;
for ( final Genotype g : vc.getGenotypes() ) {
final int[] pls = ige.getValues(g);

View File

@ -114,7 +114,10 @@ public class BCF2FieldWriterManager {
case Float:
return new BCF2FieldEncoder.Float(line, dict);
case Integer:
return new BCF2FieldEncoder.GenericInts(line, dict);
if ( line.isFixedCount() && line.getCount() == 1 )
return new BCF2FieldEncoder.AtomicInt(line, dict);
else
return new BCF2FieldEncoder.GenericInts(line, dict);
default:
throw new ReviewedStingException("Unexpected type for field " + line.getID());
}

View File

@ -257,7 +257,9 @@ class BCF2Writer extends IndexingVariantContextWriter {
/**
 * Writes every allele of the variant context through encoder.encodeTypedString.
 * When VariantContextUtils.needsPadding(vc) is true, the padded allele string
 * from VariantContextUtils.padAllele is encoded instead of the allele's own bases.
 *
 * @param vc the variant context whose alleles are encoded
 * @throws IOException declared by this method; presumably raised by the encoder writes
 */
private void buildAlleles( VariantContext vc ) throws IOException {
// computed once, outside the per-allele loop
final boolean needsPadding = VariantContextUtils.needsPadding(vc);
for ( final Allele allele : vc.getAlleles() ) {
// NOTE(review): the String declaration of s below is followed by a byte[] declaration of the
// same name; it looks like the pre-change line of this diff hunk -- confirm
final String s = needsPadding ? VariantContextUtils.padAllele(vc,allele) : allele.getDisplayString();
byte[] s = allele.getBases();
if ( needsPadding )
s = VariantContextUtils.padAllele(vc,allele).getBytes();
encoder.encodeTypedString(s);
}
}

View File

@ -342,7 +342,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("adcf53b8dcfde7f2c657745751549bfe"));
Arrays.asList("5c7db047ae9417d37c6bbda1d8ea6019"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}