Smarter infrastructure to decode genotypes in BCF

-- Eliminated the large intermediate map from field name to list of list<Integer> values needed to create genotypes without the GenotypeBuilder.  The new code is cleaner and simply fills in an array of GenotypeBuilders as it moves through the column layout in BCF2
-- We now create, once per header, decoders specialized for each genotype field (GT, AD, etc.) that can be optimized for putting data into the GenotypeBuilder.  In a subsequent commit these will use lower-level BCF2 decoders to produce ints and int[] directly, avoiding the intermediate List<Integer> form
-- Further reduced the amount of data the DiffEngine has to process.  The DiffEngine algorithm needs to be rethought to be efficient...
This commit is contained in:
Mark DePristo 2012-06-03 12:20:08 -04:00
parent 889e3c4583
commit 17fbd103d0
5 changed files with 250 additions and 117 deletions

View File

@ -51,6 +51,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
private final BCF2Decoder decoder = new BCF2Decoder();
private boolean skipGenotypes = false;
private final static int MAX_HEADER_SIZE = 0x08000000;
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
// ----------------------------------------------------------------------
//
@ -128,6 +129,9 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
// create the string dictionary
dictionary = parseDictionary(header);
// prepare the genotype field decoders
gtFieldDecoders = new BCF2GenotypeFieldDecoders(header);
// position right before next line (would be right before first real record byte at end of header)
return new FeatureCodecHeader(header, inputStream.getPosition());
}
@ -216,7 +220,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles);
}
private final static class SitesInfoForDecoding {
protected final static class SitesInfoForDecoding {
final int pos;
final int nFormatFields;
final int nSamples;
@ -361,6 +365,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
}
}
private final ArrayList<String> parseDictionary(final VCFHeader header) {
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
@ -374,4 +379,8 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
/**
 * Exposes the VCF header currently held by this codec.
 *
 * @return the value of the header field
 */
protected VCFHeader getHeader() {
    return this.header;
}
/**
 * Looks up the decoder to use for one genotype field by delegating to
 * the gtFieldDecoders registry.
 *
 * @param field the name of the genotype field
 * @return whatever gtFieldDecoders.getDecoder(field) returns
 */
protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) {
    final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = gtFieldDecoders.getDecoder(field);
    return fieldDecoder;
}
}

View File

@ -0,0 +1,210 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
 * Registry of decoders that read BCF2 genotype field data directly into
 * {@link GenotypeBuilder}s, one builder per sample.
 *
 * The fields GT, FT, DP, AD, PL and GQ are bound to specialized decoders that
 * call the matching typed setter on the builder; every other field is handled
 * by a generic decoder that stores the decoded value as a genotype attribute.
 *
 * @author Mark DePristo
 * @since 2012-06-03
 */
public class BCF2GenotypeFieldDecoders {
    // field name -> specialized decoder.  Each instance owns its own map, so
    // separate codecs never share decoder state
    private final HashMap<String, Decoder> genotypeFieldDecoder = new HashMap<String, Decoder>();

    // used for every field that has no specialized decoder bound
    private final Decoder defaultDecoder = new GenericDecoder();

    /**
     * Binds the specialized decoders to their genotype field keys.
     *
     * @param header the header of the file being decoded; not consulted yet
     *               (see the TODO below), the bindings are currently fixed
     */
    public BCF2GenotypeFieldDecoders(final VCFHeader header) {
        // TODO -- fill in appropriate decoders for each FORMAT field in the header

        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FLDecoder());
        genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder());
        genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder());
    }

    // -----------------------------------------------------------------
    //
    // Genotype field decoder
    //
    // -----------------------------------------------------------------

    /**
     * Return decoder appropriate for field, or the generic decoder if no
     * specialized one is bound
     *
     * @param field the GT field to decode
     * @return a non-null decoder
     */
    @Requires("field != null")
    @Ensures("result != null")
    public Decoder getDecoder(final String field) {
        final Decoder d = genotypeFieldDecoder.get(field);
        return d == null ? defaultDecoder : d;
    }

    /**
     * Decodes one genotype field for every sample of a record.
     *
     * An implementation walks over the builders in order and, for each one,
     * reads a value of type typeDescriptor from decoder and stores it in that
     * builder.
     *
     * This system allows us to easily use specialized decoders for specific
     * genotype field values, for example turning the PL values into an int[]
     * instead of keeping the generic List of Integer.
     */
    public interface Decoder {
        /**
         * @param siteAlleles    the alleles of the site being decoded
         * @param field          the name of the field being decoded
         * @param decoder        the BCF2 decoder to read the values from
         * @param typeDescriptor the BCF2 type descriptor of each sample's value
         * @param gbs            one builder per sample, in sample order
         */
        void decode(final List<Allele> siteAlleles,
                    final String field,
                    final BCF2Decoder decoder,
                    final byte typeDescriptor,
                    final List<GenotypeBuilder> gbs);
    }

    /** Decodes GT: turns the encoded allele indices into Allele objects. */
    private static final class GTDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                // TODO -- fast path for size == 2 (diploid)
                final List<Integer> encoded = asIntegerList(decoder.decodeTypedValue(typeDescriptor));
                if ( encoded == null ) {
                    // no called sample GT = .
                    gb.alleles(null);
                } else {
                    // we have at least some alleles to decode
                    final List<Allele> gt = new ArrayList<Allele>(encoded.size());
                    for ( final Integer encode : encoded ) {
                        if ( encode == null ) {
                            // absent, as are all following by definition
                            break;
                        }

                        // the allele offset lives above the lowest bit; 0 means no-call,
                        // k > 0 means the (k-1)th site allele
                        final int offset = encode >> 1;
                        if ( offset == 0 )
                            gt.add(Allele.NO_CALL);
                        else
                            gt.add(siteAlleles.get(offset - 1));
                    }
                    gb.alleles(gt);
                }
            }
        }
    }

    /** Decodes DP: a single Integer per sample, skipped when missing. */
    private static final class DPDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                final Object value = decoder.decodeTypedValue(typeDescriptor);
                if ( value != null )
                    gb.DP((Integer)value);
            }
        }
    }

    /** Decodes GQ: a single Integer per sample, skipped when missing. */
    private static final class GQDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                final Object value = decoder.decodeTypedValue(typeDescriptor);
                if ( value != null )
                    gb.GQ((Integer)value);
            }
        }
    }

    /** Decodes AD: an int[] per sample, skipped when missing. */
    private static final class ADDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                final int[] AD = decodeIntArray(decoder.decodeTypedValue(typeDescriptor));
                if ( AD != null )
                    gb.AD(AD);
            }
        }
    }

    /** Decodes PL: an int[] per sample, skipped when missing. */
    private static final class PLDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                final int[] PL = decodeIntArray(decoder.decodeTypedValue(typeDescriptor));
                if ( PL != null )
                    gb.PL(PL);
            }
        }
    }

    /** Placeholder for the genotype filter field, which is not supported yet. */
    private static final class FLDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            throw new ReviewedStingException("Genotype filter not implemented in BCF2 yet");
        }
    }

    /** Stores the decoded value of any other field as a genotype attribute. */
    private static final class GenericDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                Object value = decoder.decodeTypedValue(typeDescriptor);
                if ( value != null ) { // don't add missing values
                    // a one-element list is stored as its single element
                    if ( value instanceof List && ((List<?>)value).size() == 1 )
                        value = ((List<?>)value).get(0);
                    gb.attribute(field, value);
                }
            }
        }
    }

    /**
     * Views a decoded value as a list of Integers.
     *
     * NOTE(review): the DP and GQ decoders cast decoded values straight to
     * Integer, so a vector field holding a single value (e.g. a haploid GT)
     * presumably arrives as a bare Integer rather than a List -- confirm
     * against BCF2Decoder.decodeTypedValue.  That case is wrapped here so it
     * decodes instead of failing the List cast.
     *
     * @param value the decoded value; may be null
     * @return null when value is null, a one-element list when value is an
     *         Integer, otherwise value cast to a list
     * @throws ClassCastException when value is neither an Integer nor a List
     */
    @SuppressWarnings("unchecked") // element types are only checked when the elements are read
    private static List<Integer> asIntegerList(final Object value) {
        if ( value instanceof Integer ) {
            final List<Integer> single = new ArrayList<Integer>(1);
            single.add((Integer)value);
            return single;
        }
        return (List<Integer>)value;
    }

    /**
     * Converts a decoded vector value into an int[].
     *
     * Conversion stops at the first null element, mirroring how the GT decoder
     * treats a null as "absent, as are all following"; blindly unboxing such an
     * element would throw a NullPointerException.
     *
     * @param value the decoded value; may be null
     * @return null when value is null or holds nothing but missing elements,
     *         otherwise the leading non-null elements in order
     */
    private static int[] decodeIntArray(final Object value) {
        // todo -- decode directly into int[]
        final List<Integer> values = asIntegerList(value);
        if ( values == null )
            return null;

        // length of the run of present values at the front of the vector
        int nPresent = 0;
        while ( nPresent < values.size() && values.get(nPresent) != null )
            nPresent++;

        if ( nPresent == 0 && ! values.isEmpty() )
            return null; // nothing but missing elements: same as a missing field

        final int[] x = new int[nPresent];
        for ( int j = 0; j < nPresent; j++ )
            x[j] = values.get(j);
        return x;
    }
}

View File

@ -25,8 +25,6 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -60,9 +58,11 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
public LazyGenotypesContext.LazyData parse(final Object data) {
logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");
// load our bytep[] data into the decoder
// load our byte[] data into the decoder
final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);
// TODO -- fast path for sites only
// go ahead and decode everyone
final List<String> samples = new ArrayList<String>(codec.getHeader().getGenotypeSamples());
@ -71,119 +71,32 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
"different numbers of samples per record. Saw " + samples.size() +
" samples in header but have a record with " + nSamples + " samples");
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(decoder, nFields, nSamples);
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
final GenotypeBuilder gb = new GenotypeBuilder();
// create and initialize the genotypes array
final ArrayList<GenotypeBuilder> builders = new ArrayList<GenotypeBuilder>(nSamples);
for ( int i = 0; i < nSamples; i++ ) {
// all of the information we need for each genotype, with default values
gb.reset();
gb.name(samples.get(i));
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
final String field = entry.getKey();
Object value = entry.getValue().get(i);
try {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
gb.alleles(decodeGenotypeAlleles(siteAlleles, (List<Integer>)value));
} else if ( field.equals(VCFConstants.DEPTH_KEY) ) {
if ( value != BCF2Type.INT8.getMissingJavaValue() )
gb.DP((Integer)value);
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
if ( value != BCF2Type.INT8.getMissingJavaValue() )
gb.GQ((Integer)value);
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
final int[] PLs = decodeIntArray(value);
if ( PLs != null )
gb.PL(PLs);
} else if ( field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS) ) {
final int[] AD = decodeIntArray(value);
if ( AD != null )
gb.AD(AD);
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
//filters = new HashSet<String>(values.get(i));
} else { // add to attributes
if ( value != null ) { // don't add missing values
if ( value instanceof List && ((List)value).size() == 1)
value = ((List)value).get(0);
gb.attribute(field, value);
}
}
} catch ( ClassCastException e ) {
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
+ " inconsistent with the value observed in the decoded value in the "
+ " BCF file. Value was " + value);
}
}
final Genotype g = gb.make();
genotypes.add(g);
builders.add(new GenotypeBuilder(samples.get(i)));
}
for ( int i = 0; i < nFields; i++ ) {
// get the field name
final int offset = (Integer) decoder.decodeTypedValue();
final String field = codec.getDictionaryString(offset);
// the type of each element
final byte typeDescriptor = decoder.readTypeDescriptor();
final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(field);
try {
fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, builders);
} catch ( ClassCastException e ) {
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
+ " inconsistent with the value observed in the decoded value");
}
}
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
for ( final GenotypeBuilder gb : builders )
genotypes.add(gb.make());
return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset());
}
/**
 * Converts a decoded value into a primitive int array.
 *
 * @param value the decoded value, cast (unchecked) to a List of Integer; may be null
 * @return a new int[] holding the list's elements in order, or null when value is null
 */
private final int[] decodeIntArray(final Object value) {
// todo -- decode directly into int[]
final List<Integer> pls = (List<Integer>)value;
if ( pls != null ) { // we have a PL field
final int[] x = new int[pls.size()];
for ( int j = 0; j < x.length; j++ )
// auto-unboxing: a null element in the list would throw a NullPointerException here
x[j] = pls.get(j);
return x;
} else
return null;
}
/**
 * Translates the encoded allele values of one sample's GT field into Allele objects.
 *
 * Each encoded value is shifted right by one bit; a result of 0 maps to
 * Allele.NO_CALL and any other result k maps to siteAlleles.get(k - 1).
 * NOTE(review): the discarded low bit presumably carries phasing -- confirm
 * against the BCF2 specification.
 *
 * @param siteAlleles the alleles of the site, looked up by (shifted value - 1)
 * @param encoded the encoded allele values; may be null
 * @return an empty list when encoded is null, otherwise the alleles decoded
 *         up to, but not including, the first null element
 */
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
if ( encoded == null )
// no called sample GT = .
return Collections.emptyList();
else {
// we have at least some alleles to decode
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
for ( final Integer encode : encoded ) {
if ( encode == null ) // absent, as are all following by definition
return gt;
else {
// drop the lowest bit to get the allele offset
final int offset = encode >> 1;
if ( offset == 0 )
gt.add(Allele.NO_CALL);
else
gt.add(siteAlleles.get(offset - 1));
}
}
return gt;
}
}
/**
 * Reads all genotype field values of a record into a map keyed by field name.
 *
 * For each of the nFields fields this reads a typed value that is used as an
 * offset into the codec's string dictionary to obtain the field name, then a
 * type descriptor, then nSamples values of that type.
 *
 * @param decoder the decoder positioned at the start of the genotype data
 * @param nFields the number of genotype fields to read
 * @param nSamples the number of values to read per field
 * @return an empty map when nFields is 0, otherwise an insertion-ordered map
 *         from field name to the nSamples decoded values of that field
 */
private final Map<String, List<Object>> decodeGenotypeFieldValues(final BCF2Decoder decoder,
final int nFields,
final int nSamples) {
// either both counts are positive or both are zero
assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0);
if ( nFields == 0 ) // fast path exit for sites only file
return Collections.emptyMap();
else {
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
for ( int i = 0; i < nFields; i++ ) {
// the field name is stored as an offset into the string dictionary
final int offset = (Integer) decoder.decodeTypedValue();
final String field = codec.getDictionaryString(offset);
// the type of each element
final byte typeDescriptor = decoder.readTypeDescriptor();
final List<Object> values = new ArrayList<Object>(nSamples);
for ( int j = 0; j < nSamples; j++ )
values.add(decoder.decodeTypedValue(typeDescriptor));
// a field name is expected to appear only once per record
assert ! map.containsKey(field);
map.put(field, values);
}
return map;
}
}
}

View File

@ -265,6 +265,7 @@ public abstract class Genotype implements Comparable<Genotype> {
/**
 * Reports whether PL values are present for this Genotype.
 *
 * @return true when getPL() yields a non-null value, false otherwise
 */
@Ensures("(result && getLikelihoods() != null) || (! result && getLikelihoods() == null)")
public boolean hasLikelihoods() {
    final boolean plPresent = getPL() != null;
    return plPresent;
}
@ -284,7 +285,7 @@ public abstract class Genotype implements Comparable<Genotype> {
* Returns the GenotypesLikelihoods data associated with this Genotype, or null if missing
* @return null or a GenotypesLikelihood object for this sample's PL field
*/
@Ensures({"hasLikelihoods() && result != null", "! hasLikelihoods() && result == null"})
@Ensures("(hasLikelihoods() && result != null) || (! hasLikelihoods() && result == null)")
public GenotypeLikelihoods getLikelihoods() {
    // without PL values there is nothing to wrap
    if ( ! hasLikelihoods() )
        return null;

    // build the likelihoods object from the PL values
    return GenotypeLikelihoods.fromPLs(getPL());
}

View File

@ -47,8 +47,8 @@ public class MD5DB {
/**
* Subdirectory under the ant build directory where we store integration test md5 results
*/
private static final int MAX_RECORDS_TO_READ = 10000;
private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 1000;
private static final int MAX_RECORDS_TO_READ = 1000;
private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 100;
public static final String LOCAL_MD5_DB_DIR = "integrationtests";
public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests";