Smarter infrastructure to decode genotypes in BCF
-- Eliminated the large intermediate map from field name to list of list<Integer> values needed to create genotypes without the GenotypeBuilder. The new code is cleaner and simply fills in an array of GenotypeBuilders as it moves through the column layout in BCF2 -- Now we create once decoders specialized for each GT field (GT, AD, etc) that can be optimized for putting data into the GenotypeBuilder. In a subsequent commit these will actually use lower level BCF2 decoders to create the low-level ints and int[], avoiding the intermediate List<Integer> form -- Reduced the amount of data further to be computed in the DiffEngine. The DiffEngine algorithm needs to be rethought to be efficient...
This commit is contained in:
parent
889e3c4583
commit
17fbd103d0
|
|
@ -51,6 +51,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
private final BCF2Decoder decoder = new BCF2Decoder();
|
||||
private boolean skipGenotypes = false;
|
||||
private final static int MAX_HEADER_SIZE = 0x08000000;
|
||||
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -128,6 +129,9 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
// create the string dictionary
|
||||
dictionary = parseDictionary(header);
|
||||
|
||||
// prepare the genotype field decoders
|
||||
gtFieldDecoders = new BCF2GenotypeFieldDecoders(header);
|
||||
|
||||
// position right before next line (would be right before first real record byte at end of header)
|
||||
return new FeatureCodecHeader(header, inputStream.getPosition());
|
||||
}
|
||||
|
|
@ -216,7 +220,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles);
|
||||
}
|
||||
|
||||
private final static class SitesInfoForDecoding {
|
||||
protected final static class SitesInfoForDecoding {
|
||||
final int pos;
|
||||
final int nFormatFields;
|
||||
final int nSamples;
|
||||
|
|
@ -361,6 +365,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
private final ArrayList<String> parseDictionary(final VCFHeader header) {
|
||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||
|
||||
|
|
@ -374,4 +379,8 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
protected VCFHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) {
|
||||
return gtFieldDecoders.getDecoder(field);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,210 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* An efficient
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class BCF2GenotypeFieldDecoders {
|
||||
// initialized once per writer to allow parallel writers to work
|
||||
private final HashMap<String, Decoder> genotypeFieldDecoder = new HashMap<String, Decoder>();
|
||||
private final Decoder defaultDecoder = new GenericDecoder();
|
||||
|
||||
public BCF2GenotypeFieldDecoders(final VCFHeader header) {
|
||||
// TODO -- fill in appropriate decoders for each FORMAT field in the header
|
||||
|
||||
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder());
|
||||
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FLDecoder());
|
||||
genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder());
|
||||
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder());
|
||||
genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder());
|
||||
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder());
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Genotype field decoder
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return decoder appropriate for field, or the generic decoder if no
|
||||
* specialized one is bound
|
||||
* @param field the GT field to decode
|
||||
* @return a non-null decoder
|
||||
*/
|
||||
@Requires("field != null")
|
||||
@Ensures("result != null")
|
||||
public Decoder getDecoder(final String field) {
|
||||
final Decoder d = genotypeFieldDecoder.get(field);
|
||||
return d == null ? defaultDecoder : d;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decoder a field (implicit from creation) encoded as
|
||||
* typeDescriptor in the decoder object in the GenotypeBuilders
|
||||
* one for each sample in order.
|
||||
*
|
||||
* The way this works is that this decode method
|
||||
* iterates over the builders, decoding a genotype field
|
||||
* in BCF2 for each sample from decoder.
|
||||
*
|
||||
* This system allows us to easily use specialized
|
||||
* decoders for specific genotype field values. For example,
|
||||
* we use a special decoder to directly read the BCF2 data for
|
||||
* the PL field into a int[] rather than the generic List of Integer
|
||||
*/
|
||||
public interface Decoder {
|
||||
public void decode(final List<Allele> siteAlleles,
|
||||
final String field,
|
||||
final BCF2Decoder decoder,
|
||||
final byte typeDescriptor,
|
||||
final List<GenotypeBuilder> gbs);
|
||||
}
|
||||
|
||||
private class GTDecoder implements Decoder {
|
||||
@Override
|
||||
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
|
||||
for ( final GenotypeBuilder gb : gbs ) {
|
||||
// TODO -- fast path for size == 2 (diploid)
|
||||
final List<Integer> encoded = (List<Integer>)decoder.decodeTypedValue(typeDescriptor);
|
||||
if ( encoded == null )
|
||||
// no called sample GT = .
|
||||
gb.alleles(null);
|
||||
else {
|
||||
// we have at least some alleles to decode
|
||||
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
|
||||
|
||||
for ( final Integer encode : encoded ) {
|
||||
if ( encode == null ) {
|
||||
// absent, as are all following by definition
|
||||
break;
|
||||
} else {
|
||||
final int offset = encode >> 1;
|
||||
if ( offset == 0 )
|
||||
gt.add(Allele.NO_CALL);
|
||||
else
|
||||
gt.add(siteAlleles.get(offset - 1));
|
||||
}
|
||||
}
|
||||
|
||||
gb.alleles(gt);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class DPDecoder implements Decoder {
|
||||
@Override
|
||||
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
|
||||
for ( final GenotypeBuilder gb : gbs ) {
|
||||
final Object value = decoder.decodeTypedValue(typeDescriptor);
|
||||
if ( value != null )
|
||||
gb.DP((Integer)value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class GQDecoder implements Decoder {
|
||||
@Override
|
||||
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
|
||||
for ( final GenotypeBuilder gb : gbs ) {
|
||||
final Object value = decoder.decodeTypedValue(typeDescriptor);
|
||||
if ( value != null )
|
||||
gb.GQ((Integer)value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class ADDecoder implements Decoder {
|
||||
@Override
|
||||
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
|
||||
for ( final GenotypeBuilder gb : gbs ) {
|
||||
final int[] AD = decodeIntArray(decoder.decodeTypedValue(typeDescriptor));
|
||||
if ( AD != null )
|
||||
gb.AD(AD);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class PLDecoder implements Decoder {
|
||||
@Override
|
||||
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
|
||||
for ( final GenotypeBuilder gb : gbs ) {
|
||||
final int[] PL = decodeIntArray(decoder.decodeTypedValue(typeDescriptor));
|
||||
if ( PL != null )
|
||||
gb.PL(PL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class FLDecoder implements Decoder {
|
||||
@Override
|
||||
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
|
||||
throw new ReviewedStingException("Genotype filter not implemented in BCF2 yet");
|
||||
}
|
||||
}
|
||||
|
||||
private class GenericDecoder implements Decoder {
|
||||
@Override
|
||||
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
|
||||
for ( final GenotypeBuilder gb : gbs ) {
|
||||
Object value = decoder.decodeTypedValue(typeDescriptor);
|
||||
if ( value != null ) { // don't add missing values
|
||||
if ( value instanceof List && ((List)value).size() == 1)
|
||||
value = ((List)value).get(0);
|
||||
gb.attribute(field, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static final int[] decodeIntArray(final Object value) {
|
||||
// todo -- decode directly into int[]
|
||||
final List<Integer> pls = (List<Integer>)value;
|
||||
if ( pls != null ) { // we have a PL field
|
||||
final int[] x = new int[pls.size()];
|
||||
for ( int j = 0; j < x.length; j++ )
|
||||
x[j] = pls.get(j);
|
||||
return x;
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -25,8 +25,6 @@
|
|||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
|
|
@ -60,9 +58,11 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
|
|||
public LazyGenotypesContext.LazyData parse(final Object data) {
|
||||
logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");
|
||||
|
||||
// load our bytep[] data into the decoder
|
||||
// load our byte[] data into the decoder
|
||||
final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);
|
||||
|
||||
// TODO -- fast path for sites only
|
||||
|
||||
// go ahead and decode everyone
|
||||
final List<String> samples = new ArrayList<String>(codec.getHeader().getGenotypeSamples());
|
||||
|
||||
|
|
@ -71,119 +71,32 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
|
|||
"different numbers of samples per record. Saw " + samples.size() +
|
||||
" samples in header but have a record with " + nSamples + " samples");
|
||||
|
||||
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(decoder, nFields, nSamples);
|
||||
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
|
||||
final GenotypeBuilder gb = new GenotypeBuilder();
|
||||
// create and initialize the genotypes array
|
||||
final ArrayList<GenotypeBuilder> builders = new ArrayList<GenotypeBuilder>(nSamples);
|
||||
for ( int i = 0; i < nSamples; i++ ) {
|
||||
// all of the information we need for each genotype, with default values
|
||||
gb.reset();
|
||||
gb.name(samples.get(i));
|
||||
|
||||
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
|
||||
final String field = entry.getKey();
|
||||
Object value = entry.getValue().get(i);
|
||||
try {
|
||||
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
|
||||
gb.alleles(decodeGenotypeAlleles(siteAlleles, (List<Integer>)value));
|
||||
} else if ( field.equals(VCFConstants.DEPTH_KEY) ) {
|
||||
if ( value != BCF2Type.INT8.getMissingJavaValue() )
|
||||
gb.DP((Integer)value);
|
||||
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
|
||||
if ( value != BCF2Type.INT8.getMissingJavaValue() )
|
||||
gb.GQ((Integer)value);
|
||||
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
|
||||
final int[] PLs = decodeIntArray(value);
|
||||
if ( PLs != null )
|
||||
gb.PL(PLs);
|
||||
} else if ( field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS) ) {
|
||||
final int[] AD = decodeIntArray(value);
|
||||
if ( AD != null )
|
||||
gb.AD(AD);
|
||||
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
|
||||
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
|
||||
//filters = new HashSet<String>(values.get(i));
|
||||
} else { // add to attributes
|
||||
if ( value != null ) { // don't add missing values
|
||||
if ( value instanceof List && ((List)value).size() == 1)
|
||||
value = ((List)value).get(0);
|
||||
gb.attribute(field, value);
|
||||
}
|
||||
}
|
||||
} catch ( ClassCastException e ) {
|
||||
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
|
||||
+ " inconsistent with the value observed in the decoded value in the "
|
||||
+ " BCF file. Value was " + value);
|
||||
}
|
||||
}
|
||||
|
||||
final Genotype g = gb.make();
|
||||
genotypes.add(g);
|
||||
builders.add(new GenotypeBuilder(samples.get(i)));
|
||||
}
|
||||
|
||||
for ( int i = 0; i < nFields; i++ ) {
|
||||
// get the field name
|
||||
final int offset = (Integer) decoder.decodeTypedValue();
|
||||
final String field = codec.getDictionaryString(offset);
|
||||
|
||||
// the type of each element
|
||||
final byte typeDescriptor = decoder.readTypeDescriptor();
|
||||
final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(field);
|
||||
try {
|
||||
fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, builders);
|
||||
} catch ( ClassCastException e ) {
|
||||
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
|
||||
+ " inconsistent with the value observed in the decoded value");
|
||||
}
|
||||
}
|
||||
|
||||
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
|
||||
for ( final GenotypeBuilder gb : builders )
|
||||
genotypes.add(gb.make());
|
||||
|
||||
return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset());
|
||||
}
|
||||
|
||||
private final int[] decodeIntArray(final Object value) {
|
||||
// todo -- decode directly into int[]
|
||||
final List<Integer> pls = (List<Integer>)value;
|
||||
if ( pls != null ) { // we have a PL field
|
||||
final int[] x = new int[pls.size()];
|
||||
for ( int j = 0; j < x.length; j++ )
|
||||
x[j] = pls.get(j);
|
||||
return x;
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
|
||||
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
|
||||
if ( encoded == null )
|
||||
// no called sample GT = .
|
||||
return Collections.emptyList();
|
||||
else {
|
||||
// we have at least some alleles to decode
|
||||
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
|
||||
for ( final Integer encode : encoded ) {
|
||||
if ( encode == null ) // absent, as are all following by definition
|
||||
return gt;
|
||||
else {
|
||||
final int offset = encode >> 1;
|
||||
if ( offset == 0 )
|
||||
gt.add(Allele.NO_CALL);
|
||||
else
|
||||
gt.add(siteAlleles.get(offset - 1));
|
||||
}
|
||||
}
|
||||
|
||||
return gt;
|
||||
}
|
||||
}
|
||||
|
||||
private final Map<String, List<Object>> decodeGenotypeFieldValues(final BCF2Decoder decoder,
|
||||
final int nFields,
|
||||
final int nSamples) {
|
||||
assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0);
|
||||
|
||||
if ( nFields == 0 ) // fast path exit for sites only file
|
||||
return Collections.emptyMap();
|
||||
else {
|
||||
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
|
||||
|
||||
for ( int i = 0; i < nFields; i++ ) {
|
||||
final int offset = (Integer) decoder.decodeTypedValue();
|
||||
final String field = codec.getDictionaryString(offset);
|
||||
|
||||
// the type of each element
|
||||
final byte typeDescriptor = decoder.readTypeDescriptor();
|
||||
final List<Object> values = new ArrayList<Object>(nSamples);
|
||||
for ( int j = 0; j < nSamples; j++ )
|
||||
values.add(decoder.decodeTypedValue(typeDescriptor));
|
||||
|
||||
assert ! map.containsKey(field);
|
||||
|
||||
map.put(field, values);
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -265,6 +265,7 @@ public abstract class Genotype implements Comparable<Genotype> {
|
|||
/**
|
||||
* @return Returns true if this Genotype has PL field values
|
||||
*/
|
||||
@Ensures("(result && getLikelihoods() != null) || (! result && getLikelihoods() == null)")
|
||||
public boolean hasLikelihoods() {
|
||||
return getPL() != null;
|
||||
}
|
||||
|
|
@ -284,7 +285,7 @@ public abstract class Genotype implements Comparable<Genotype> {
|
|||
* Returns the GenotypesLikelihoods data associated with this Genotype, or null if missing
|
||||
* @return null or a GenotypesLikelihood object for this sample's PL field
|
||||
*/
|
||||
@Ensures({"hasLikelihoods() && result != null", "! hasLikelihoods() && result == null"})
|
||||
@Ensures("(hasLikelihoods() && result != null) || (! hasLikelihoods() && result == null)")
|
||||
public GenotypeLikelihoods getLikelihoods() {
|
||||
return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,8 +47,8 @@ public class MD5DB {
|
|||
/**
|
||||
* Subdirectory under the ant build directory where we store integration test md5 results
|
||||
*/
|
||||
private static final int MAX_RECORDS_TO_READ = 10000;
|
||||
private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 1000;
|
||||
private static final int MAX_RECORDS_TO_READ = 1000;
|
||||
private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 100;
|
||||
public static final String LOCAL_MD5_DB_DIR = "integrationtests";
|
||||
public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests";
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue