BCF2 genotype decoding is now lazy

-- Refactored BCF2Codec's genotype decoding into a LazyGenotypesDecoder object that decodes BCF2 genotype data blocks on demand, à la VCFCodec.
-- VCFHeader now exposes getters for sampleNamesInOrder and sampleNameToOffset instead of protected fields accessed directly by VCFCodec.
This commit is contained in:
Mark DePristo 2012-05-31 13:47:31 -04:00
parent 9eb83a0771
commit ff9ac4b5f8
5 changed files with 221 additions and 112 deletions

View File

@ -33,7 +33,6 @@ import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -81,7 +80,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
decoder.skipNextBlock(genotypeBlockSize, inputStream);
} else {
decoder.readNextBlock(genotypeBlockSize, inputStream);
decodeGenotypes(info, builder);
createLazyGenotypesDecoder(info, builder);
}
return builder.fullyDecoded(true).make();
@ -297,114 +296,32 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.attributes(infoFieldEntries);
}
private void decodeGenotypes( final SitesInfoForDecoding siteInfo, final VariantContextBuilder builder ) {
final List<String> samples = new ArrayList<String>(header.getGenotypeSamples());
final int nSamples = siteInfo.nSamples;
final int nFields = siteInfo.nFormatFields;
// --------------------------------------------------------------------------------
//
// Decoding Genotypes
//
// --------------------------------------------------------------------------------
if ( samples.size() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + samples.size() +
" samples in header but have a record with " + nSamples + " samples");
/**
 * Create the lazy loader for the genotypes data, and store it in the builder
 * so that the VC will be able to decode on demand the genotypes data
 *
 * @param siteInfo the site-level information (alleles, sample count, format-field count) needed to decode the genotypes block
 * @param builder  the VariantContextBuilder that will receive the lazy genotypes context
 */
private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo,
final VariantContextBuilder builder ) {
if (siteInfo.nSamples > 0) {
final LazyGenotypesContext.LazyParser lazyParser =
new LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields);
final int nGenotypes = header.getGenotypeSamples().size();
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, decoder.getRecordBytes(), nGenotypes);
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(nFields, nSamples);
final List<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
for ( int i = 0; i < nSamples; i++ ) {
// all of the information we need for each genotype, with default values
final String sampleName = samples.get(i);
List<Allele> alleles = null;
boolean isPhased = false;
double log10PError = VariantContext.NO_LOG10_PERROR;
Set<String> filters = null;
Map<String, Object> attributes = null;
double[] log10Likelihoods = null;
// did we resort the sample names? If so, we need to load the genotype data
if ( !header.samplesWereAlreadySorted() )
lazy.decode();
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
final String field = entry.getKey();
Object value = entry.getValue().get(i);
try {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
alleles = decodeGenotypeAlleles(siteInfo.alleles, (List<Integer>)value);
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
if ( value != BCF2Type.INT8.getMissingJavaValue() )
log10PError = ((Integer)value) / -10.0;
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
final List<Integer> pls = (List<Integer>)value;
if ( pls != null ) { // we have a PL field
log10Likelihoods = new double[pls.size()];
for ( int j = 0; j < log10Likelihoods.length; j++ ) {
final double d = pls.get(j);
log10Likelihoods[j] = d == -0.0 ? 0.0 : d / -10.0;
}
}
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
//filters = new HashSet<String>(values.get(i));
} else { // add to attributes
if ( value != null ) { // don't add missing values
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
if ( value instanceof List && ((List)value).size() == 1)
value = ((List)value).get(0);
attributes.put(field, value);
}
}
} catch ( ClassCastException e ) {
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
+ " inconsistent with the value observed in the decoded value in the "
+ " BCF file. Value was " + value);
}
}
if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found");
final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
genotypes.add(g);
}
builder.genotypes(genotypes);
}
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
if ( encoded == null )
// no called sample GT = .
return Collections.emptyList();
else {
// we have at least some alleles to decode
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
for ( final Integer encode : encoded ) {
if ( encode == null ) // absent, as are all following by definition
return gt;
else {
final int offset = encode >> 1;
if ( offset == 0 )
gt.add(Allele.NO_CALL);
else
gt.add(siteAlleles.get(offset - 1));
}
}
return gt;
}
}
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) {
assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0);
if ( nFields == 0 ) // fast path exit for sites only file
return Collections.emptyMap();
else {
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
for ( int i = 0; i < nFields; i++ ) {
final String field = getDictionaryString();
final byte typeDescriptor = decoder.readTypeDescriptor();
final List<Object> values = new ArrayList<Object>(nSamples);
for ( int j = 0; j < nSamples; j++ )
values.add(decoder.decodeTypedValue(typeDescriptor));
map.put(field, values);
}
return map;
builder.genotypesNoValidation(lazy);
}
}
@ -412,7 +329,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
return getDictionaryString((Integer) decoder.decodeTypedValue());
}
private final String getDictionaryString(final int offset) {
protected final String getDictionaryString(final int offset) {
if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset);
final String field = dictionary.get(offset);
return field;
@ -436,4 +353,8 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
return dict;
}
/**
 * Returns the VCF header parsed from this BCF2 file's header block.
 *
 * Exposed package-internally so that LazyGenotypesDecoder can look up
 * sample names and ordering when it decodes a genotypes block on demand.
 *
 * @return the header for this codec's file
 */
protected VCFHeader getHeader() {
return header;
}
}

View File

@ -0,0 +1,180 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.LazyGenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
 * Lazy version of the genotypes decoder for BCF2 genotypes.
 *
 * Implements {@link LazyGenotypesContext.LazyParser} so that the raw bytes of a
 * BCF2 record's genotypes block are only decoded into Genotype objects when the
 * VariantContext actually asks for them, mirroring the lazy path in VCFCodec.
 *
 * @author Mark DePristo
 * @since 5/12
 */
class LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
    final protected static Logger logger = Logger.getLogger(LazyGenotypesDecoder.class);

    // The essential information needed to decode the genotypes data, captured when
    // this lazy decoder is created by BCF2Codec so that parse() can run later
    // without access to the codec's per-record state.
    private final BCF2Codec codec;               // provides the header (sample names) and the string dictionary
    private final ArrayList<Allele> siteAlleles; // alleles at this site, used to decode GT
    private final int nSamples;                  // number of samples in this record
    private final int nFields;                   // number of FORMAT fields in this record

    LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples, final int nFields) {
        this.codec = codec;
        this.siteAlleles = alleles;
        this.nSamples = nSamples;
        this.nFields = nFields;
    }

    /**
     * Decode a raw BCF2 genotypes block into fully materialized genotypes.
     *
     * @param data the byte[] holding the still-encoded genotypes block of one BCF2 record
     * @return the decoded genotypes plus the header's sample-name ordering information
     * @throws UserException.MalformedBCF2 if the record's sample count disagrees with the header
     */
    @Override
    public LazyGenotypesContext.LazyData parse(final Object data) {
        // This runs once per record whose genotypes are actually requested. Logging at
        // info level here would spam the log and defeat the point of lazy decoding,
        // so keep it at debug and guard the string concatenation.
        if ( logger.isDebugEnabled() )
            logger.debug("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");

        // load our byte[] data into the decoder
        final BCF2Decoder decoder = new BCF2Decoder((byte[])data);

        final List<String> samples = new ArrayList<String>(codec.getHeader().getGenotypeSamples());
        if ( samples.size() != nSamples )
            throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
                    "different numbers of samples per record. Saw " + samples.size() +
                    " samples in header but have a record with " + nSamples + " samples");

        // decode all field values up front (field -> one value per sample), then
        // assemble one Genotype per sample
        final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(decoder, nFields, nSamples);
        final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
        for ( int i = 0; i < nSamples; i++ )
            genotypes.add(decodeGenotype(samples.get(i), i, fieldValues));

        return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset());
    }

    /**
     * Build the Genotype for a single sample from the per-field value lists.
     *
     * @param sampleName  the name of this sample
     * @param i           this sample's index into each field's value list
     * @param fieldValues map from FORMAT field name to its per-sample values
     * @return the decoded Genotype for this sample
     * @throws UserException.MalformedBCF2 if no GT alleles are found or a value has an unexpected type
     */
    private Genotype decodeGenotype(final String sampleName, final int i, final Map<String, List<Object>> fieldValues) {
        // all of the information we need for each genotype, with default values
        List<Allele> alleles = null;
        final boolean isPhased = false;   // phasing information isn't decoded here
        double log10PError = VariantContext.NO_LOG10_PERROR;
        final Set<String> filters = null; // genotype filters not supported (see FT branch below)
        Map<String, Object> attributes = null;
        double[] log10Likelihoods = null;

        for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
            final String field = entry.getKey();
            Object value = entry.getValue().get(i);
            try {
                if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
                    alleles = decodeGenotypeAlleles(siteAlleles, (List<Integer>)value);
                } else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
                    // missing GQ leaves log10PError at NO_LOG10_PERROR
                    if ( value != BCF2Type.INT8.getMissingJavaValue() )
                        log10PError = ((Integer)value) / -10.0;
                } else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
                    final List<Integer> pls = (List<Integer>)value;
                    if ( pls != null ) { // we have a PL field
                        log10Likelihoods = new double[pls.size()];
                        for ( int j = 0; j < log10Likelihoods.length; j++ ) {
                            final double d = pls.get(j);
                            // avoid emitting -0.0 when PL == 0
                            log10Likelihoods[j] = d == -0.0 ? 0.0 : d / -10.0;
                        }
                    }
                } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
                    throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
                } else { // add to attributes
                    if ( value != null ) { // don't add missing values
                        if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
                        // unwrap singleton lists so attributes match VCF-style scalar values
                        if ( value instanceof List && ((List<?>)value).size() == 1 )
                            value = ((List<?>)value).get(0);
                        attributes.put(field, value);
                    }
                }
            } catch ( ClassCastException e ) {
                throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
                        + " inconsistent with the value observed in the decoded value in the "
                        + " BCF file. Value was " + value);
            }
        }

        if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found");
        return new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
    }

    /**
     * Decode the GT field's encoded allele indices into Allele objects.
     *
     * Each encoded value packs the allele offset shifted left by one bit
     * (the low bit is reserved for phasing, which is not decoded here);
     * offset 0 means no-call, offset k > 0 means siteAlleles[k - 1].
     *
     * @param siteAlleles the alleles segregating at this site
     * @param encoded     the encoded GT values, or null for a fully missing GT (".")
     * @return the decoded alleles; may be shorter than encoded if trailing values are absent
     */
    private List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
        if ( encoded == null )
            // no called sample GT = .
            return Collections.emptyList();
        else {
            // we have at least some alleles to decode
            final List<Allele> gt = new ArrayList<Allele>(encoded.size());
            for ( final Integer encode : encoded ) {
                if ( encode == null ) // absent, as are all following by definition
                    return gt;
                else {
                    final int offset = encode >> 1;
                    if ( offset == 0 )
                        gt.add(Allele.NO_CALL);
                    else
                        gt.add(siteAlleles.get(offset - 1));
                }
            }
            return gt;
        }
    }

    /**
     * Read every FORMAT field's values for every sample from the decoder.
     *
     * @param decoder  the decoder positioned at the start of the genotypes block
     * @param nFields  number of FORMAT fields in this record
     * @param nSamples number of samples in this record
     * @return an insertion-ordered map from field name to its nSamples decoded values;
     *         empty for a sites-only record
     */
    private Map<String, List<Object>> decodeGenotypeFieldValues(final BCF2Decoder decoder,
                                                                final int nFields,
                                                                final int nSamples) {
        assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0);
        if ( nFields == 0 ) // fast path exit for sites only file
            return Collections.emptyMap();
        else {
            // LinkedHashMap preserves the on-disk field order for deterministic output
            final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
            for ( int i = 0; i < nFields; i++ ) {
                final int offset = (Integer) decoder.decodeTypedValue();
                final String field = codec.getDictionaryString(offset);
                final byte typeDescriptor = decoder.readTypeDescriptor();
                final List<Object> values = new ArrayList<Object>(nSamples);
                for ( int j = 0; j < nSamples; j++ )
                    values.add(decoder.decodeTypedValue(typeDescriptor));
                map.put(field, values);
            }
            return map;
        }
    }
}

View File

@ -192,7 +192,7 @@ public class VCF3Codec extends AbstractVCFCodec {
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
}
@Override

View File

@ -241,7 +241,7 @@ public class VCFCodec extends AbstractVCFCodec {
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
}
@Override

View File

@ -68,8 +68,8 @@ public class VCFHeader {
private boolean samplesWereAlreadySorted = true;
// cache for efficient conversion of VCF -> VariantContext
protected ArrayList<String> sampleNamesInOrder = null;
protected HashMap<String, Integer> sampleNameToOffset = null;
private ArrayList<String> sampleNamesInOrder = null;
private HashMap<String, Integer> sampleNameToOffset = null;
private boolean writeEngineHeaders = true;
private boolean writeCommandLine = true;
@ -299,4 +299,12 @@ public class VCFHeader {
public void setWriteCommandLine(boolean writeCommandLine) {
this.writeCommandLine = writeCommandLine;
}
/**
 * Returns the cached sample names in sorted order, used for efficient
 * VCF-to-VariantContext conversion by the lazy genotype decoders.
 *
 * NOTE(review): this returns the internal mutable cache directly (may be null
 * before the cache is built) — callers must not modify it; consider returning
 * an unmodifiable view. TODO confirm against cache-building code not visible here.
 *
 * @return the internal ordered sample-name list
 */
public ArrayList<String> getSampleNamesInOrder() {
return sampleNamesInOrder;
}
/**
 * Returns the cached mapping from sample name to its offset, used for efficient
 * VCF-to-VariantContext conversion by the lazy genotype decoders.
 *
 * NOTE(review): this returns the internal mutable cache directly (may be null
 * before the cache is built) — callers must not modify it; consider returning
 * an unmodifiable view. TODO confirm against cache-building code not visible here.
 *
 * @return the internal sample-name-to-offset map
 */
public HashMap<String, Integer> getSampleNameToOffset() {
return sampleNameToOffset;
}
}