BCF2 genotype decoding is now lazy
-- Refactored BCF2Codec into a LazyGenotypesDecoder object that provides on-demand genotype decoding of BCF2 data blocks a la VCFCodec. -- VCFHeader has getters for sampleNamesInOrder and sampleNameToOffset instead of protected variables directly accessed by VCFCodec
This commit is contained in:
parent
9eb83a0771
commit
ff9ac4b5f8
|
|
@ -33,7 +33,6 @@ import org.broad.tribble.readers.AsciiLineReader;
|
|||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -81,7 +80,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
decoder.skipNextBlock(genotypeBlockSize, inputStream);
|
||||
} else {
|
||||
decoder.readNextBlock(genotypeBlockSize, inputStream);
|
||||
decodeGenotypes(info, builder);
|
||||
createLazyGenotypesDecoder(info, builder);
|
||||
}
|
||||
|
||||
return builder.fullyDecoded(true).make();
|
||||
|
|
@ -297,114 +296,32 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
builder.attributes(infoFieldEntries);
|
||||
}
|
||||
|
||||
private void decodeGenotypes( final SitesInfoForDecoding siteInfo, final VariantContextBuilder builder ) {
|
||||
final List<String> samples = new ArrayList<String>(header.getGenotypeSamples());
|
||||
final int nSamples = siteInfo.nSamples;
|
||||
final int nFields = siteInfo.nFormatFields;
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Decoding Genotypes
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
if ( samples.size() != nSamples )
|
||||
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
|
||||
"different numbers of samples per record. Saw " + samples.size() +
|
||||
" samples in header but have a record with " + nSamples + " samples");
|
||||
/**
|
||||
* Create the lazy loader for the genotypes data, and store it in the builder
|
||||
* so that the VC will be able to decode on demand the genotypes data
|
||||
*
|
||||
* @param siteInfo
|
||||
* @param builder
|
||||
*/
|
||||
private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo,
|
||||
final VariantContextBuilder builder ) {
|
||||
if (siteInfo.nSamples > 0) {
|
||||
final LazyGenotypesContext.LazyParser lazyParser =
|
||||
new LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields);
|
||||
final int nGenotypes = header.getGenotypeSamples().size();
|
||||
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, decoder.getRecordBytes(), nGenotypes);
|
||||
|
||||
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(nFields, nSamples);
|
||||
final List<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
|
||||
for ( int i = 0; i < nSamples; i++ ) {
|
||||
// all of the information we need for each genotype, with default values
|
||||
final String sampleName = samples.get(i);
|
||||
List<Allele> alleles = null;
|
||||
boolean isPhased = false;
|
||||
double log10PError = VariantContext.NO_LOG10_PERROR;
|
||||
Set<String> filters = null;
|
||||
Map<String, Object> attributes = null;
|
||||
double[] log10Likelihoods = null;
|
||||
// did we resort the sample names? If so, we need to load the genotype data
|
||||
if ( !header.samplesWereAlreadySorted() )
|
||||
lazy.decode();
|
||||
|
||||
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
|
||||
final String field = entry.getKey();
|
||||
Object value = entry.getValue().get(i);
|
||||
try {
|
||||
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
|
||||
alleles = decodeGenotypeAlleles(siteInfo.alleles, (List<Integer>)value);
|
||||
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
|
||||
if ( value != BCF2Type.INT8.getMissingJavaValue() )
|
||||
log10PError = ((Integer)value) / -10.0;
|
||||
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
|
||||
final List<Integer> pls = (List<Integer>)value;
|
||||
if ( pls != null ) { // we have a PL field
|
||||
log10Likelihoods = new double[pls.size()];
|
||||
for ( int j = 0; j < log10Likelihoods.length; j++ ) {
|
||||
final double d = pls.get(j);
|
||||
log10Likelihoods[j] = d == -0.0 ? 0.0 : d / -10.0;
|
||||
}
|
||||
}
|
||||
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
|
||||
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
|
||||
//filters = new HashSet<String>(values.get(i));
|
||||
} else { // add to attributes
|
||||
if ( value != null ) { // don't add missing values
|
||||
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
|
||||
if ( value instanceof List && ((List)value).size() == 1)
|
||||
value = ((List)value).get(0);
|
||||
attributes.put(field, value);
|
||||
}
|
||||
}
|
||||
} catch ( ClassCastException e ) {
|
||||
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
|
||||
+ " inconsistent with the value observed in the decoded value in the "
|
||||
+ " BCF file. Value was " + value);
|
||||
}
|
||||
}
|
||||
|
||||
if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found");
|
||||
|
||||
final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
|
||||
genotypes.add(g);
|
||||
}
|
||||
|
||||
builder.genotypes(genotypes);
|
||||
}
|
||||
|
||||
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
|
||||
if ( encoded == null )
|
||||
// no called sample GT = .
|
||||
return Collections.emptyList();
|
||||
else {
|
||||
// we have at least some alleles to decode
|
||||
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
|
||||
for ( final Integer encode : encoded ) {
|
||||
if ( encode == null ) // absent, as are all following by definition
|
||||
return gt;
|
||||
else {
|
||||
final int offset = encode >> 1;
|
||||
if ( offset == 0 )
|
||||
gt.add(Allele.NO_CALL);
|
||||
else
|
||||
gt.add(siteAlleles.get(offset - 1));
|
||||
}
|
||||
}
|
||||
|
||||
return gt;
|
||||
}
|
||||
}
|
||||
|
||||
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) {
|
||||
assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0);
|
||||
|
||||
if ( nFields == 0 ) // fast path exit for sites only file
|
||||
return Collections.emptyMap();
|
||||
else {
|
||||
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
|
||||
|
||||
for ( int i = 0; i < nFields; i++ ) {
|
||||
final String field = getDictionaryString();
|
||||
final byte typeDescriptor = decoder.readTypeDescriptor();
|
||||
final List<Object> values = new ArrayList<Object>(nSamples);
|
||||
for ( int j = 0; j < nSamples; j++ )
|
||||
values.add(decoder.decodeTypedValue(typeDescriptor));
|
||||
map.put(field, values);
|
||||
}
|
||||
|
||||
return map;
|
||||
builder.genotypesNoValidation(lazy);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -412,7 +329,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
return getDictionaryString((Integer) decoder.decodeTypedValue());
|
||||
}
|
||||
|
||||
private final String getDictionaryString(final int offset) {
|
||||
protected final String getDictionaryString(final int offset) {
|
||||
if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset);
|
||||
final String field = dictionary.get(offset);
|
||||
return field;
|
||||
|
|
@ -436,4 +353,8 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
|
||||
return dict;
|
||||
}
|
||||
|
||||
protected VCFHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,180 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.LazyGenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Lazy version of genotypes decoder for BCF2 genotypes
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 5/12
|
||||
*/
|
||||
class LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
|
||||
final protected static Logger logger = Logger.getLogger(LazyGenotypesDecoder.class);
|
||||
|
||||
// the essential information for us to use to decode the genotypes data
|
||||
// initialized when this lazy decoder is created, as we know all of this from the BCF2Codec
|
||||
// and its stored here again for code cleanliness
|
||||
private final BCF2Codec codec;
|
||||
private final ArrayList<Allele> siteAlleles;
|
||||
private final int nSamples;
|
||||
private final int nFields;
|
||||
|
||||
LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples, final int nFields) {
|
||||
this.codec = codec;
|
||||
this.siteAlleles = alleles;
|
||||
this.nSamples = nSamples;
|
||||
this.nFields = nFields;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LazyGenotypesContext.LazyData parse(final Object data) {
|
||||
logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");
|
||||
|
||||
// load our bytep[] data into the decoder
|
||||
final BCF2Decoder decoder = new BCF2Decoder((byte[])data);
|
||||
|
||||
// go ahead and decode everyone
|
||||
final List<String> samples = new ArrayList<String>(codec.getHeader().getGenotypeSamples());
|
||||
|
||||
if ( samples.size() != nSamples )
|
||||
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
|
||||
"different numbers of samples per record. Saw " + samples.size() +
|
||||
" samples in header but have a record with " + nSamples + " samples");
|
||||
|
||||
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(decoder, nFields, nSamples);
|
||||
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
|
||||
for ( int i = 0; i < nSamples; i++ ) {
|
||||
// all of the information we need for each genotype, with default values
|
||||
final String sampleName = samples.get(i);
|
||||
List<Allele> alleles = null;
|
||||
boolean isPhased = false;
|
||||
double log10PError = VariantContext.NO_LOG10_PERROR;
|
||||
Set<String> filters = null;
|
||||
Map<String, Object> attributes = null;
|
||||
double[] log10Likelihoods = null;
|
||||
|
||||
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
|
||||
final String field = entry.getKey();
|
||||
Object value = entry.getValue().get(i);
|
||||
try {
|
||||
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
|
||||
alleles = decodeGenotypeAlleles(siteAlleles, (List<Integer>)value);
|
||||
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
|
||||
if ( value != BCF2Type.INT8.getMissingJavaValue() )
|
||||
log10PError = ((Integer)value) / -10.0;
|
||||
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
|
||||
final List<Integer> pls = (List<Integer>)value;
|
||||
if ( pls != null ) { // we have a PL field
|
||||
log10Likelihoods = new double[pls.size()];
|
||||
for ( int j = 0; j < log10Likelihoods.length; j++ ) {
|
||||
final double d = pls.get(j);
|
||||
log10Likelihoods[j] = d == -0.0 ? 0.0 : d / -10.0;
|
||||
}
|
||||
}
|
||||
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
|
||||
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
|
||||
//filters = new HashSet<String>(values.get(i));
|
||||
} else { // add to attributes
|
||||
if ( value != null ) { // don't add missing values
|
||||
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
|
||||
if ( value instanceof List && ((List)value).size() == 1)
|
||||
value = ((List)value).get(0);
|
||||
attributes.put(field, value);
|
||||
}
|
||||
}
|
||||
} catch ( ClassCastException e ) {
|
||||
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
|
||||
+ " inconsistent with the value observed in the decoded value in the "
|
||||
+ " BCF file. Value was " + value);
|
||||
}
|
||||
}
|
||||
|
||||
if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found");
|
||||
|
||||
final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
|
||||
genotypes.add(g);
|
||||
}
|
||||
|
||||
return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset());
|
||||
}
|
||||
|
||||
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
|
||||
if ( encoded == null )
|
||||
// no called sample GT = .
|
||||
return Collections.emptyList();
|
||||
else {
|
||||
// we have at least some alleles to decode
|
||||
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
|
||||
for ( final Integer encode : encoded ) {
|
||||
if ( encode == null ) // absent, as are all following by definition
|
||||
return gt;
|
||||
else {
|
||||
final int offset = encode >> 1;
|
||||
if ( offset == 0 )
|
||||
gt.add(Allele.NO_CALL);
|
||||
else
|
||||
gt.add(siteAlleles.get(offset - 1));
|
||||
}
|
||||
}
|
||||
|
||||
return gt;
|
||||
}
|
||||
}
|
||||
|
||||
private final Map<String, List<Object>> decodeGenotypeFieldValues(final BCF2Decoder decoder,
|
||||
final int nFields,
|
||||
final int nSamples) {
|
||||
assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0);
|
||||
|
||||
if ( nFields == 0 ) // fast path exit for sites only file
|
||||
return Collections.emptyMap();
|
||||
else {
|
||||
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
|
||||
|
||||
for ( int i = 0; i < nFields; i++ ) {
|
||||
final int offset = (Integer) decoder.decodeTypedValue();
|
||||
final String field = codec.getDictionaryString(offset);
|
||||
final byte typeDescriptor = decoder.readTypeDescriptor();
|
||||
final List<Object> values = new ArrayList<Object>(nSamples);
|
||||
for ( int j = 0; j < nSamples; j++ )
|
||||
values.add(decoder.decodeTypedValue(typeDescriptor));
|
||||
map.put(field, values);
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -192,7 +192,7 @@ public class VCF3Codec extends AbstractVCFCodec {
|
|||
}
|
||||
}
|
||||
|
||||
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
|
||||
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -241,7 +241,7 @@ public class VCFCodec extends AbstractVCFCodec {
|
|||
}
|
||||
}
|
||||
|
||||
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
|
||||
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -68,8 +68,8 @@ public class VCFHeader {
|
|||
private boolean samplesWereAlreadySorted = true;
|
||||
|
||||
// cache for efficient conversion of VCF -> VariantContext
|
||||
protected ArrayList<String> sampleNamesInOrder = null;
|
||||
protected HashMap<String, Integer> sampleNameToOffset = null;
|
||||
private ArrayList<String> sampleNamesInOrder = null;
|
||||
private HashMap<String, Integer> sampleNameToOffset = null;
|
||||
|
||||
private boolean writeEngineHeaders = true;
|
||||
private boolean writeCommandLine = true;
|
||||
|
|
@ -299,4 +299,12 @@ public class VCFHeader {
|
|||
public void setWriteCommandLine(boolean writeCommandLine) {
|
||||
this.writeCommandLine = writeCommandLine;
|
||||
}
|
||||
|
||||
public ArrayList<String> getSampleNamesInOrder() {
|
||||
return sampleNamesInOrder;
|
||||
}
|
||||
|
||||
public HashMap<String, Integer> getSampleNameToOffset() {
|
||||
return sampleNameToOffset;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue