diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 3974daa64..84018b652 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -33,7 +33,6 @@ import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -81,7 +80,7 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende decoder.skipNextBlock(genotypeBlockSize, inputStream); } else { decoder.readNextBlock(genotypeBlockSize, inputStream); - decodeGenotypes(info, builder); + createLazyGenotypesDecoder(info, builder); } return builder.fullyDecoded(true).make(); @@ -297,114 +296,32 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende builder.attributes(infoFieldEntries); } - private void decodeGenotypes( final SitesInfoForDecoding siteInfo, final VariantContextBuilder builder ) { - final List samples = new ArrayList(header.getGenotypeSamples()); - final int nSamples = siteInfo.nSamples; - final int nFields = siteInfo.nFormatFields; + // -------------------------------------------------------------------------------- + // + // Decoding Genotypes + // + // -------------------------------------------------------------------------------- - if ( samples.size() != nSamples ) - throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " + - "different numbers of samples per record. Saw " + samples.size() + - " samples in header but have a record with " + nSamples + " samples"); + /** + * Create the lazy loader for the genotypes data, and store it in the builder + * so that the VC will be able to decode on demand the genotypes data + * + * @param siteInfo + * @param builder + */ + private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo, + final VariantContextBuilder builder ) { + if (siteInfo.nSamples > 0) { + final LazyGenotypesContext.LazyParser lazyParser = + new LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields); + final int nGenotypes = header.getGenotypeSamples().size(); + LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, decoder.getRecordBytes(), nGenotypes); - final Map> fieldValues = decodeGenotypeFieldValues(nFields, nSamples); - final List genotypes = new ArrayList(nSamples); - for ( int i = 0; i < nSamples; i++ ) { - // all of the information we need for each genotype, with default values - final String sampleName = samples.get(i); - List alleles = null; - boolean isPhased = false; - double log10PError = VariantContext.NO_LOG10_PERROR; - Set filters = null; - Map attributes = null; - double[] log10Likelihoods = null; + // did we resort the sample names? If so, we need to load the genotype data + if ( !header.samplesWereAlreadySorted() ) + lazy.decode(); - for ( final Map.Entry> entry : fieldValues.entrySet() ) { - final String field = entry.getKey(); - Object value = entry.getValue().get(i); - try { - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { - alleles = decodeGenotypeAlleles(siteInfo.alleles, (List)value); - } else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { - if ( value != BCF2Type.INT8.getMissingJavaValue() ) - log10PError = ((Integer)value) / -10.0; - } else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) { - final List pls = (List)value; - if ( pls != null ) { // we have a PL field - log10Likelihoods = new double[pls.size()]; - for ( int j = 0; j < log10Likelihoods.length; j++ ) { - final double d = pls.get(j); - log10Likelihoods[j] = d == -0.0 ? 0.0 : d / -10.0; - } - } - } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { - throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2"); - //filters = new HashSet(values.get(i)); - } else { // add to attributes - if ( value != null ) { // don't add missing values - if ( attributes == null ) attributes = new HashMap(nFields); - if ( value instanceof List && ((List)value).size() == 1) - value = ((List)value).get(0); - attributes.put(field, value); - } - } - } catch ( ClassCastException e ) { - throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field - + " inconsistent with the value observed in the decoded value in the " - + " BCF file. Value was " + value); - } - } - - if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found"); - - final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods); - genotypes.add(g); - } - - builder.genotypes(genotypes); - } - - private final List decodeGenotypeAlleles(final ArrayList siteAlleles, final List encoded) { - if ( encoded == null ) - // no called sample GT = . - return Collections.emptyList(); - else { - // we have at least some alleles to decode - final List gt = new ArrayList(encoded.size()); - for ( final Integer encode : encoded ) { - if ( encode == null ) // absent, as are all following by definition - return gt; - else { - final int offset = encode >> 1; - if ( offset == 0 ) - gt.add(Allele.NO_CALL); - else - gt.add(siteAlleles.get(offset - 1)); - } - } - - return gt; - } - } - - private final Map> decodeGenotypeFieldValues(final int nFields, final int nSamples) { - assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0); - - if ( nFields == 0 ) // fast path exit for sites only file - return Collections.emptyMap(); - else { - final Map> map = new LinkedHashMap>(nFields); - - for ( int i = 0; i < nFields; i++ ) { - final String field = getDictionaryString(); - final byte typeDescriptor = decoder.readTypeDescriptor(); - final List values = new ArrayList(nSamples); - for ( int j = 0; j < nSamples; j++ ) - values.add(decoder.decodeTypedValue(typeDescriptor)); - map.put(field, values); - } - - return map; + builder.genotypesNoValidation(lazy); } } @@ -412,7 +329,7 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende return getDictionaryString((Integer) decoder.decodeTypedValue()); } - private final String getDictionaryString(final int offset) { + protected final String getDictionaryString(final int offset) { if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset); final String field = dictionary.get(offset); return field; @@ -436,4 +353,8 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende return dict; } + + protected VCFHeader getHeader() { + return header; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/LazyGenotypesDecoder.java new file mode 100644 index 000000000..6d716756f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/LazyGenotypesDecoder.java @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.LazyGenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +/** + * Lazy version of genotypes decoder for BCF2 genotypes + * + * @author Mark DePristo + * @since 5/12 + */ +class LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { + final protected static Logger logger = Logger.getLogger(LazyGenotypesDecoder.class); + + // the essential information for us to use to decode the genotypes data + // initialized when this lazy decoder is created, as we know all of this from the BCF2Codec + // and its stored here again for code cleanliness + private final BCF2Codec codec; + private final ArrayList siteAlleles; + private final int nSamples; + private final int nFields; + + LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList alleles, final int nSamples, final int nFields) { + this.codec = codec; + this.siteAlleles = alleles; + this.nSamples = nSamples; + this.nFields = nFields; + } + + @Override + public LazyGenotypesContext.LazyData parse(final Object data) { + logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each"); + + // load our bytep[] data into the decoder + final BCF2Decoder decoder = new BCF2Decoder((byte[])data); + + // go ahead and decode everyone + final List samples = new ArrayList(codec.getHeader().getGenotypeSamples()); + + if ( samples.size() != nSamples ) + throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " + + "different numbers of samples per record. Saw " + samples.size() + + " samples in header but have a record with " + nSamples + " samples"); + + final Map> fieldValues = decodeGenotypeFieldValues(decoder, nFields, nSamples); + final ArrayList genotypes = new ArrayList(nSamples); + for ( int i = 0; i < nSamples; i++ ) { + // all of the information we need for each genotype, with default values + final String sampleName = samples.get(i); + List alleles = null; + boolean isPhased = false; + double log10PError = VariantContext.NO_LOG10_PERROR; + Set filters = null; + Map attributes = null; + double[] log10Likelihoods = null; + + for ( final Map.Entry> entry : fieldValues.entrySet() ) { + final String field = entry.getKey(); + Object value = entry.getValue().get(i); + try { + if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { + alleles = decodeGenotypeAlleles(siteAlleles, (List)value); + } else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { + if ( value != BCF2Type.INT8.getMissingJavaValue() ) + log10PError = ((Integer)value) / -10.0; + } else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) { + final List pls = (List)value; + if ( pls != null ) { // we have a PL field + log10Likelihoods = new double[pls.size()]; + for ( int j = 0; j < log10Likelihoods.length; j++ ) { + final double d = pls.get(j); + log10Likelihoods[j] = d == -0.0 ? 0.0 : d / -10.0; + } + } + } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { + throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2"); + //filters = new HashSet(values.get(i)); + } else { // add to attributes + if ( value != null ) { // don't add missing values + if ( attributes == null ) attributes = new HashMap(nFields); + if ( value instanceof List && ((List)value).size() == 1) + value = ((List)value).get(0); + attributes.put(field, value); + } + } + } catch ( ClassCastException e ) { + throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field + + " inconsistent with the value observed in the decoded value in the " + + " BCF file. Value was " + value); + } + } + + if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found"); + + final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods); + genotypes.add(g); + } + + return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset()); + } + + private final List decodeGenotypeAlleles(final ArrayList siteAlleles, final List encoded) { + if ( encoded == null ) + // no called sample GT = . + return Collections.emptyList(); + else { + // we have at least some alleles to decode + final List gt = new ArrayList(encoded.size()); + for ( final Integer encode : encoded ) { + if ( encode == null ) // absent, as are all following by definition + return gt; + else { + final int offset = encode >> 1; + if ( offset == 0 ) + gt.add(Allele.NO_CALL); + else + gt.add(siteAlleles.get(offset - 1)); + } + } + + return gt; + } + } + + private final Map> decodeGenotypeFieldValues(final BCF2Decoder decoder, + final int nFields, + final int nSamples) { + assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0); + + if ( nFields == 0 ) // fast path exit for sites only file + return Collections.emptyMap(); + else { + final Map> map = new LinkedHashMap>(nFields); + + for ( int i = 0; i < nFields; i++ ) { + final int offset = (Integer) decoder.decodeTypedValue(); + final String field = codec.getDictionaryString(offset); + final byte typeDescriptor = decoder.readTypeDescriptor(); + final List values = new ArrayList(nSamples); + for ( int j = 0; j < nSamples; j++ ) + values.add(decoder.decodeTypedValue(typeDescriptor)); + map.put(field, values); + } + + return map; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index ce1fc98ae..4c079d5a3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -192,7 +192,7 @@ public class VCF3Codec extends AbstractVCFCodec { } } - return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset); + return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset()); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index e68ea6e00..0b3d0b5ab 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -241,7 +241,7 @@ public class VCFCodec extends AbstractVCFCodec { } } - return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset); + return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset()); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 0b89c6ef6..99fb23755 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -68,8 +68,8 @@ public class VCFHeader { private boolean samplesWereAlreadySorted = true; // cache for efficient conversion of VCF -> VariantContext - protected ArrayList sampleNamesInOrder = null; - protected HashMap sampleNameToOffset = null; + private ArrayList sampleNamesInOrder = null; + private HashMap sampleNameToOffset = null; private boolean writeEngineHeaders = true; private boolean writeCommandLine = true; @@ -299,4 +299,12 @@ public class VCFHeader { public void setWriteCommandLine(boolean writeCommandLine) { this.writeCommandLine = writeCommandLine; } + + public ArrayList getSampleNamesInOrder() { + return sampleNamesInOrder; + } + + public HashMap getSampleNameToOffset() { + return sampleNameToOffset; + } }