From 17fbd103d087d3ab969bb08247bf685acbc573d5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 3 Jun 2012 12:20:08 -0400 Subject: [PATCH] Smarter infrastructure to decode genotypes in BCF -- Eliminated the large intermediate map from field name to list of list values needed to create genotypes without the GenotypeBuilder. The new code is cleaner and simply fills in an array of GenotypeBuilders as it moves through the column layout in BCF2 -- Now we create once decoders specialized for each GT field (GT, AD, etc) that can be optimized for putting data into the GenotypeBuilder. In a subsequent commit these will actually use lower level BCF2 decoders to create the low-level ints and int[], avoiding the intermediate List form -- Reduced the amount of data further to be computed in the DiffEngine. The DiffEngine algorithm needs to be rethought to be efficient... --- .../sting/utils/codecs/bcf2/BCF2Codec.java | 11 +- .../bcf2/BCF2GenotypeFieldDecoders.java | 210 ++++++++++++++++++ .../codecs/bcf2/BCF2LazyGenotypesDecoder.java | 139 +++--------- .../sting/utils/variantcontext/Genotype.java | 3 +- .../test/org/broadinstitute/sting/MD5DB.java | 4 +- 5 files changed, 250 insertions(+), 117 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 48c4c8a4f..565de4e25 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -51,6 +51,7 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende private final BCF2Decoder decoder = new BCF2Decoder(); private boolean skipGenotypes = false; private final static int MAX_HEADER_SIZE = 0x08000000; + private BCF2GenotypeFieldDecoders gtFieldDecoders = null; // ---------------------------------------------------------------------- // @@ -128,6 +129,9 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende // create the string dictionary dictionary = parseDictionary(header); + // prepare the genotype field decoders + gtFieldDecoders = new BCF2GenotypeFieldDecoders(header); + // position right before next line (would be right before first real record byte at end of header) return new FeatureCodecHeader(header, inputStream.getPosition()); } @@ -216,7 +220,7 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles); } - private final static class SitesInfoForDecoding { + protected final static class SitesInfoForDecoding { final int pos; final int nFormatFields; final int nSamples; @@ -361,6 +365,7 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende } } + private final ArrayList parseDictionary(final VCFHeader header) { final ArrayList dict = BCF2Utils.makeDictionary(header); @@ -374,4 +379,8 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende protected VCFHeader getHeader() { return header; } + + protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) { + return gtFieldDecoders.getDecoder(field); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java new file mode 100644 index 000000000..e0c89fd77 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +/** + * An efficient + * + * @author Your Name + * @since Date created + */ +public class BCF2GenotypeFieldDecoders { + // initialized once per writer to allow parallel writers to work + private final HashMap genotypeFieldDecoder = new HashMap(); + private final Decoder defaultDecoder = new GenericDecoder(); + + public BCF2GenotypeFieldDecoders(final VCFHeader header) { + // TODO -- fill in appropriate decoders for each FORMAT field in the header + + genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder()); + genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FLDecoder()); + genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder()); + genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder()); + genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder()); + genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder()); + } + + // ----------------------------------------------------------------- + // + // Genotype field decoder + // + // ----------------------------------------------------------------- + + /** + * Return decoder appropriate for field, or the generic decoder if no + * specialized one is bound + * @param field the GT field to decode + * @return a non-null decoder + */ + @Requires("field != null") + @Ensures("result != null") + public Decoder getDecoder(final String field) { + final Decoder d = genotypeFieldDecoder.get(field); + return d == null ? defaultDecoder : d; + } + + /** + * Decoder a field (implicit from creation) encoded as + * typeDescriptor in the decoder object in the GenotypeBuilders + * one for each sample in order. + * + * The way this works is that this decode method + * iterates over the builders, decoding a genotype field + * in BCF2 for each sample from decoder. + * + * This system allows us to easily use specialized + * decoders for specific genotype field values. For example, + * we use a special decoder to directly read the BCF2 data for + * the PL field into a int[] rather than the generic List of Integer + */ + public interface Decoder { + public void decode(final List siteAlleles, + final String field, + final BCF2Decoder decoder, + final byte typeDescriptor, + final List gbs); + } + + private class GTDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + // TODO -- fast path for size == 2 (diploid) + final List encoded = (List)decoder.decodeTypedValue(typeDescriptor); + if ( encoded == null ) + // no called sample GT = . + gb.alleles(null); + else { + // we have at least some alleles to decode + final List gt = new ArrayList(encoded.size()); + + for ( final Integer encode : encoded ) { + if ( encode == null ) { + // absent, as are all following by definition + break; + } else { + final int offset = encode >> 1; + if ( offset == 0 ) + gt.add(Allele.NO_CALL); + else + gt.add(siteAlleles.get(offset - 1)); + } + } + + gb.alleles(gt); + } + } + } + } + + private class DPDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + final Object value = decoder.decodeTypedValue(typeDescriptor); + if ( value != null ) + gb.DP((Integer)value); + } + } + } + + private class GQDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + final Object value = decoder.decodeTypedValue(typeDescriptor); + if ( value != null ) + gb.GQ((Integer)value); + } + } + } + + private class ADDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + final int[] AD = decodeIntArray(decoder.decodeTypedValue(typeDescriptor)); + if ( AD != null ) + gb.AD(AD); + } + } + } + + private class PLDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + final int[] PL = decodeIntArray(decoder.decodeTypedValue(typeDescriptor)); + if ( PL != null ) + gb.PL(PL); + } + } + } + + private class FLDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + throw new ReviewedStingException("Genotype filter not implemented in BCF2 yet"); + } + } + + private class GenericDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + Object value = decoder.decodeTypedValue(typeDescriptor); + if ( value != null ) { // don't add missing values + if ( value instanceof List && ((List)value).size() == 1) + value = ((List)value).get(0); + gb.attribute(field, value); + } + } + } + } + + private static final int[] decodeIntArray(final Object value) { + // todo -- decode directly into int[] + final List pls = (List)value; + if ( pls != null ) { // we have a PL field + final int[] x = new int[pls.size()]; + for ( int j = 0; j < x.length; j++ ) + x[j] = pls.get(j); + return x; + } else + return null; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index 050ff0717..bafda761d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -25,8 +25,6 @@ package org.broadinstitute.sting.utils.codecs.bcf2; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -60,9 +58,11 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { public LazyGenotypesContext.LazyData parse(final Object data) { logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each"); - // load our bytep[] data into the decoder + // load our byte[] data into the decoder final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes); + // TODO -- fast path for sites only + // go ahead and decode everyone final List samples = new ArrayList(codec.getHeader().getGenotypeSamples()); @@ -71,119 +71,32 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { "different numbers of samples per record. Saw " + samples.size() + " samples in header but have a record with " + nSamples + " samples"); - final Map> fieldValues = decodeGenotypeFieldValues(decoder, nFields, nSamples); - final ArrayList genotypes = new ArrayList(nSamples); - final GenotypeBuilder gb = new GenotypeBuilder(); + // create and initialize the genotypes array + final ArrayList builders = new ArrayList(nSamples); for ( int i = 0; i < nSamples; i++ ) { - // all of the information we need for each genotype, with default values - gb.reset(); - gb.name(samples.get(i)); - - for ( final Map.Entry> entry : fieldValues.entrySet() ) { - final String field = entry.getKey(); - Object value = entry.getValue().get(i); - try { - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { - gb.alleles(decodeGenotypeAlleles(siteAlleles, (List)value)); - } else if ( field.equals(VCFConstants.DEPTH_KEY) ) { - if ( value != BCF2Type.INT8.getMissingJavaValue() ) - gb.DP((Integer)value); - } else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { - if ( value != BCF2Type.INT8.getMissingJavaValue() ) - gb.GQ((Integer)value); - } else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) { - final int[] PLs = decodeIntArray(value); - if ( PLs != null ) - gb.PL(PLs); - } else if ( field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS) ) { - final int[] AD = decodeIntArray(value); - if ( AD != null ) - gb.AD(AD); - } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { - throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2"); - //filters = new HashSet(values.get(i)); - } else { // add to attributes - if ( value != null ) { // don't add missing values - if ( value instanceof List && ((List)value).size() == 1) - value = ((List)value).get(0); - gb.attribute(field, value); - } - } - } catch ( ClassCastException e ) { - throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field - + " inconsistent with the value observed in the decoded value in the " - + " BCF file. Value was " + value); - } - } - - final Genotype g = gb.make(); - genotypes.add(g); + builders.add(new GenotypeBuilder(samples.get(i))); } + for ( int i = 0; i < nFields; i++ ) { + // get the field name + final int offset = (Integer) decoder.decodeTypedValue(); + final String field = codec.getDictionaryString(offset); + + // the type of each element + final byte typeDescriptor = decoder.readTypeDescriptor(); + final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(field); + try { + fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, builders); + } catch ( ClassCastException e ) { + throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field + + " inconsistent with the value observed in the decoded value"); + } + } + + final ArrayList genotypes = new ArrayList(nSamples); + for ( final GenotypeBuilder gb : builders ) + genotypes.add(gb.make()); + return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset()); } - - private final int[] decodeIntArray(final Object value) { - // todo -- decode directly into int[] - final List pls = (List)value; - if ( pls != null ) { // we have a PL field - final int[] x = new int[pls.size()]; - for ( int j = 0; j < x.length; j++ ) - x[j] = pls.get(j); - return x; - } else - return null; - } - - private final List decodeGenotypeAlleles(final ArrayList siteAlleles, final List encoded) { - if ( encoded == null ) - // no called sample GT = . - return Collections.emptyList(); - else { - // we have at least some alleles to decode - final List gt = new ArrayList(encoded.size()); - for ( final Integer encode : encoded ) { - if ( encode == null ) // absent, as are all following by definition - return gt; - else { - final int offset = encode >> 1; - if ( offset == 0 ) - gt.add(Allele.NO_CALL); - else - gt.add(siteAlleles.get(offset - 1)); - } - } - - return gt; - } - } - - private final Map> decodeGenotypeFieldValues(final BCF2Decoder decoder, - final int nFields, - final int nSamples) { - assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0); - - if ( nFields == 0 ) // fast path exit for sites only file - return Collections.emptyMap(); - else { - final Map> map = new LinkedHashMap>(nFields); - - for ( int i = 0; i < nFields; i++ ) { - final int offset = (Integer) decoder.decodeTypedValue(); - final String field = codec.getDictionaryString(offset); - - // the type of each element - final byte typeDescriptor = decoder.readTypeDescriptor(); - final List values = new ArrayList(nSamples); - for ( int j = 0; j < nSamples; j++ ) - values.add(decoder.decodeTypedValue(typeDescriptor)); - - assert ! map.containsKey(field); - - map.put(field, values); - } - - return map; - } - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index 52cd6fb77..75b893757 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -265,6 +265,7 @@ public abstract class Genotype implements Comparable { /** * @return Returns true if this Genotype has PL field values */ + @Ensures("(result && getLikelihoods() != null) || (! result && getLikelihoods() == null)") public boolean hasLikelihoods() { return getPL() != null; } @@ -284,7 +285,7 @@ public abstract class Genotype implements Comparable { * Returns the GenotypesLikelihoods data associated with this Genotype, or null if missing * @return null or a GenotypesLikelihood object for this sample's PL field */ - @Ensures({"hasLikelihoods() && result != null", "! hasLikelihoods() && result == null"}) + @Ensures("(hasLikelihoods() && result != null) || (! hasLikelihoods() && result == null)") public GenotypeLikelihoods getLikelihoods() { return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null; } diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index 7c47e8306..fd2643387 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -47,8 +47,8 @@ public class MD5DB { /** * Subdirectory under the ant build directory where we store integration test md5 results */ - private static final int MAX_RECORDS_TO_READ = 10000; - private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 1000; + private static final int MAX_RECORDS_TO_READ = 1000; + private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 100; public static final String LOCAL_MD5_DB_DIR = "integrationtests"; public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests";