diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 48c4c8a4f..565de4e25 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -51,6 +51,7 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende private final BCF2Decoder decoder = new BCF2Decoder(); private boolean skipGenotypes = false; private final static int MAX_HEADER_SIZE = 0x08000000; + private BCF2GenotypeFieldDecoders gtFieldDecoders = null; // ---------------------------------------------------------------------- // @@ -128,6 +129,9 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende // create the string dictionary dictionary = parseDictionary(header); + // prepare the genotype field decoders + gtFieldDecoders = new BCF2GenotypeFieldDecoders(header); + // position right before next line (would be right before first real record byte at end of header) return new FeatureCodecHeader(header, inputStream.getPosition()); } @@ -216,7 +220,7 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles); } - private final static class SitesInfoForDecoding { + protected final static class SitesInfoForDecoding { final int pos; final int nFormatFields; final int nSamples; @@ -361,6 +365,7 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende } } + private final ArrayList parseDictionary(final VCFHeader header) { final ArrayList dict = BCF2Utils.makeDictionary(header); @@ -374,4 +379,8 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende protected VCFHeader getHeader() { return header; } + + protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) { + return gtFieldDecoders.getDecoder(field); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java new file mode 100644 index 000000000..e0c89fd77 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +/** + * An efficient + * + * @author Your Name + * @since Date created + */ +public class BCF2GenotypeFieldDecoders { + // initialized once per writer to allow parallel writers to work + private final HashMap genotypeFieldDecoder = new HashMap(); + private final Decoder defaultDecoder = new GenericDecoder(); + + public BCF2GenotypeFieldDecoders(final VCFHeader header) { + // TODO -- fill in appropriate decoders for each FORMAT field in the header + + genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder()); + genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FLDecoder()); + genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder()); + genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder()); + genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder()); + genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder()); + } + + // ----------------------------------------------------------------- + // + // Genotype field decoder + // + // ----------------------------------------------------------------- + + /** + * Return decoder appropriate for field, or the generic decoder if no + * specialized one is bound + * @param field the GT field to decode + * @return a non-null decoder + */ + @Requires("field != null") + @Ensures("result != null") + public Decoder getDecoder(final String field) { + final Decoder d = genotypeFieldDecoder.get(field); + return d == null ? defaultDecoder : d; + } + + /** + * Decoder a field (implicit from creation) encoded as + * typeDescriptor in the decoder object in the GenotypeBuilders + * one for each sample in order. + * + * The way this works is that this decode method + * iterates over the builders, decoding a genotype field + * in BCF2 for each sample from decoder. + * + * This system allows us to easily use specialized + * decoders for specific genotype field values. For example, + * we use a special decoder to directly read the BCF2 data for + * the PL field into a int[] rather than the generic List of Integer + */ + public interface Decoder { + public void decode(final List siteAlleles, + final String field, + final BCF2Decoder decoder, + final byte typeDescriptor, + final List gbs); + } + + private class GTDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + // TODO -- fast path for size == 2 (diploid) + final List encoded = (List)decoder.decodeTypedValue(typeDescriptor); + if ( encoded == null ) + // no called sample GT = . + gb.alleles(null); + else { + // we have at least some alleles to decode + final List gt = new ArrayList(encoded.size()); + + for ( final Integer encode : encoded ) { + if ( encode == null ) { + // absent, as are all following by definition + break; + } else { + final int offset = encode >> 1; + if ( offset == 0 ) + gt.add(Allele.NO_CALL); + else + gt.add(siteAlleles.get(offset - 1)); + } + } + + gb.alleles(gt); + } + } + } + } + + private class DPDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + final Object value = decoder.decodeTypedValue(typeDescriptor); + if ( value != null ) + gb.DP((Integer)value); + } + } + } + + private class GQDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + final Object value = decoder.decodeTypedValue(typeDescriptor); + if ( value != null ) + gb.GQ((Integer)value); + } + } + } + + private class ADDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + final int[] AD = decodeIntArray(decoder.decodeTypedValue(typeDescriptor)); + if ( AD != null ) + gb.AD(AD); + } + } + } + + private class PLDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + final int[] PL = decodeIntArray(decoder.decodeTypedValue(typeDescriptor)); + if ( PL != null ) + gb.PL(PL); + } + } + } + + private class FLDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + throw new ReviewedStingException("Genotype filter not implemented in BCF2 yet"); + } + } + + private class GenericDecoder implements Decoder { + @Override + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + for ( final GenotypeBuilder gb : gbs ) { + Object value = decoder.decodeTypedValue(typeDescriptor); + if ( value != null ) { // don't add missing values + if ( value instanceof List && ((List)value).size() == 1) + value = ((List)value).get(0); + gb.attribute(field, value); + } + } + } + } + + private static final int[] decodeIntArray(final Object value) { + // todo -- decode directly into int[] + final List pls = (List)value; + if ( pls != null ) { // we have a PL field + final int[] x = new int[pls.size()]; + for ( int j = 0; j < x.length; j++ ) + x[j] = pls.get(j); + return x; + } else + return null; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index 050ff0717..bafda761d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -25,8 +25,6 @@ package org.broadinstitute.sting.utils.codecs.bcf2; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -60,9 +58,11 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { public LazyGenotypesContext.LazyData parse(final Object data) { logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each"); - // load our bytep[] data into the decoder + // load our byte[] data into the decoder final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes); + // TODO -- fast path for sites only + // go ahead and decode everyone final List samples = new ArrayList(codec.getHeader().getGenotypeSamples()); @@ -71,119 +71,32 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { "different numbers of samples per record. Saw " + samples.size() + " samples in header but have a record with " + nSamples + " samples"); - final Map> fieldValues = decodeGenotypeFieldValues(decoder, nFields, nSamples); - final ArrayList genotypes = new ArrayList(nSamples); - final GenotypeBuilder gb = new GenotypeBuilder(); + // create and initialize the genotypes array + final ArrayList builders = new ArrayList(nSamples); for ( int i = 0; i < nSamples; i++ ) { - // all of the information we need for each genotype, with default values - gb.reset(); - gb.name(samples.get(i)); - - for ( final Map.Entry> entry : fieldValues.entrySet() ) { - final String field = entry.getKey(); - Object value = entry.getValue().get(i); - try { - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { - gb.alleles(decodeGenotypeAlleles(siteAlleles, (List)value)); - } else if ( field.equals(VCFConstants.DEPTH_KEY) ) { - if ( value != BCF2Type.INT8.getMissingJavaValue() ) - gb.DP((Integer)value); - } else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { - if ( value != BCF2Type.INT8.getMissingJavaValue() ) - gb.GQ((Integer)value); - } else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) { - final int[] PLs = decodeIntArray(value); - if ( PLs != null ) - gb.PL(PLs); - } else if ( field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS) ) { - final int[] AD = decodeIntArray(value); - if ( AD != null ) - gb.AD(AD); - } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { - throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2"); - //filters = new HashSet(values.get(i)); - } else { // add to attributes - if ( value != null ) { // don't add missing values - if ( value instanceof List && ((List)value).size() == 1) - value = ((List)value).get(0); - gb.attribute(field, value); - } - } - } catch ( ClassCastException e ) { - throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field - + " inconsistent with the value observed in the decoded value in the " - + " BCF file. Value was " + value); - } - } - - final Genotype g = gb.make(); - genotypes.add(g); + builders.add(new GenotypeBuilder(samples.get(i))); } + for ( int i = 0; i < nFields; i++ ) { + // get the field name + final int offset = (Integer) decoder.decodeTypedValue(); + final String field = codec.getDictionaryString(offset); + + // the type of each element + final byte typeDescriptor = decoder.readTypeDescriptor(); + final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(field); + try { + fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, builders); + } catch ( ClassCastException e ) { + throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field + + " inconsistent with the value observed in the decoded value"); + } + } + + final ArrayList genotypes = new ArrayList(nSamples); + for ( final GenotypeBuilder gb : builders ) + genotypes.add(gb.make()); + return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset()); } - - private final int[] decodeIntArray(final Object value) { - // todo -- decode directly into int[] - final List pls = (List)value; - if ( pls != null ) { // we have a PL field - final int[] x = new int[pls.size()]; - for ( int j = 0; j < x.length; j++ ) - x[j] = pls.get(j); - return x; - } else - return null; - } - - private final List decodeGenotypeAlleles(final ArrayList siteAlleles, final List encoded) { - if ( encoded == null ) - // no called sample GT = . - return Collections.emptyList(); - else { - // we have at least some alleles to decode - final List gt = new ArrayList(encoded.size()); - for ( final Integer encode : encoded ) { - if ( encode == null ) // absent, as are all following by definition - return gt; - else { - final int offset = encode >> 1; - if ( offset == 0 ) - gt.add(Allele.NO_CALL); - else - gt.add(siteAlleles.get(offset - 1)); - } - } - - return gt; - } - } - - private final Map> decodeGenotypeFieldValues(final BCF2Decoder decoder, - final int nFields, - final int nSamples) { - assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0); - - if ( nFields == 0 ) // fast path exit for sites only file - return Collections.emptyMap(); - else { - final Map> map = new LinkedHashMap>(nFields); - - for ( int i = 0; i < nFields; i++ ) { - final int offset = (Integer) decoder.decodeTypedValue(); - final String field = codec.getDictionaryString(offset); - - // the type of each element - final byte typeDescriptor = decoder.readTypeDescriptor(); - final List values = new ArrayList(nSamples); - for ( int j = 0; j < nSamples; j++ ) - values.add(decoder.decodeTypedValue(typeDescriptor)); - - assert ! map.containsKey(field); - - map.put(field, values); - } - - return map; - } - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index 52cd6fb77..75b893757 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -265,6 +265,7 @@ public abstract class Genotype implements Comparable { /** * @return Returns true if this Genotype has PL field values */ + @Ensures("(result && getLikelihoods() != null) || (! result && getLikelihoods() == null)") public boolean hasLikelihoods() { return getPL() != null; } @@ -284,7 +285,7 @@ public abstract class Genotype implements Comparable { * Returns the GenotypesLikelihoods data associated with this Genotype, or null if missing * @return null or a GenotypesLikelihood object for this sample's PL field */ - @Ensures({"hasLikelihoods() && result != null", "! hasLikelihoods() && result == null"}) + @Ensures("(hasLikelihoods() && result != null) || (! hasLikelihoods() && result == null)") public GenotypeLikelihoods getLikelihoods() { return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null; } diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index 7c47e8306..fd2643387 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -47,8 +47,8 @@ public class MD5DB { /** * Subdirectory under the ant build directory where we store integration test md5 results */ - private static final int MAX_RECORDS_TO_READ = 10000; - private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 1000; + private static final int MAX_RECORDS_TO_READ = 1000; + private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 100; public static final String LOCAL_MD5_DB_DIR = "integrationtests"; public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests";