From c8ed0bfc4cf49c5edd27ef2e0f9e92593cfa375c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 18 May 2012 15:49:23 -0400 Subject: [PATCH] Edge case fixes for BCF2 --handle entirely missing GT in a sample in decodeGenotypeAlleles --Create MAX_ALLELES_IN_GENOTYPES constant in BCF2Utils, and extracted its use inline from the code -- Generalized genotype writing code to handle ploidy != 2 and variable ploidy among samples -- Remove special case inline treatment of case where all samples have no GT field values, and moved this into calcVCFGenotypeKeys -- Removed restriction on getPloidy requiring ploidy > 1. It's logically find to return 0 for a no called sample -- getMaxPloidy() in VC that does what it says -- Support for padding / depadding of generic genotype fields --- .../sting/utils/codecs/bcf2/BCF2Codec.java | 29 +++++--- .../sting/utils/codecs/bcf2/BCF2Decoder.java | 6 +- .../sting/utils/codecs/bcf2/BCF2Utils.java | 2 + .../sting/utils/variantcontext/Genotype.java | 4 +- .../utils/variantcontext/VariantContext.java | 12 ++++ .../variantcontext/writer/BCF2Writer.java | 69 ++++++++++--------- .../variantcontext/writer/VCFWriter.java | 15 ++-- .../VariantContextTestProvider.java | 10 ++- 8 files changed, 94 insertions(+), 53 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 3c9eff51c..7bb0e16c5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -365,19 +365,26 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende } private final List decodeGenotypeAlleles(final ArrayList siteAlleles, final List encoded) { - final List gt = new ArrayList(encoded.size()); - for ( final Integer encode : encoded ) { - if ( encode == null ) // absent, as are all following by definition - return gt; - else { - final int offset = encode >> 1; - if ( offset == 0 ) - gt.add(Allele.NO_CALL); - else - gt.add(siteAlleles.get(offset - 1)); + if ( encoded == null ) + // no called sample GT = . + return Collections.emptyList(); + else { + // we have at least some alleles to decode + final List gt = new ArrayList(encoded.size()); + for ( final Integer encode : encoded ) { + if ( encode == null ) // absent, as are all following by definition + return gt; + else { + final int offset = encode >> 1; + if ( offset == 0 ) + gt.add(Allele.NO_CALL); + else + gt.add(siteAlleles.get(offset - 1)); + } } + + return gt; } - return gt; } private final Map> decodeGenotypeFieldValues(final int nFields, final int nSamples) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index 93a3b9519..1ae84483d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -145,9 +145,11 @@ public class BCF2Decoder { } else { final ArrayList ints = new ArrayList(size); for ( int i = 0; i < size; i++ ) { - ints.add(decodeSingleValue(type)); + final Object val = decodeSingleValue(type); + if ( val == null ) continue; // auto-pruning. We remove trailing nulls + ints.add(val); } - return ints.get(0) == null ? null : ints; // return null when all of the values are null + return ints.isEmpty() ? null : ints; // return null when all of the values are null } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index aeec7260b..ae8b91cdc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -47,6 +47,8 @@ import java.util.List; public class BCF2Utils { public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes(); + public static final int MAX_ALLELES_IN_GENOTYPES = 127; + public static final int OVERFLOW_ELEMENT_MARKER = 15; public static final int MAX_INLINE_ELEMENTS = 14; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index a5d20b860..9945d5047 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -108,11 +108,9 @@ public class Genotype implements Comparable { public boolean isPhased() { return isPhased; } /** - * @return the ploidy of this genotype + * @return the ploidy of this genotype. 0 if the site is no-called. */ public int getPloidy() { - if ( alleles.size() == 0 ) - throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype"); return alleles.size(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index ec865499d..9c1baef69 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -586,6 +586,18 @@ public class VariantContext implements Feature { // to enable tribble integratio return alleles.size(); } + /** + * Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes + * @return -1, or the max ploidy + */ + public int getMaxPloidy() { + int max = -1; + for ( final Genotype g : getGenotypes() ) { + max = Math.max(g.getPloidy(), max); + } + return max; + } + /** * @return The allele sharing the same bases as this String. A convenience method; better to use byte[] */ diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 57c07fb90..446d37dec 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -246,17 +246,9 @@ class BCF2Writer extends IndexingVariantContextWriter { if ( o == null ) continue; if ( o instanceof List ) { // only do compute if first value is of type list - final List values = (List)g.getAttribute(field); - if ( values != null ) { - if ( values.size() != size && size != -1 ) - throw new ReviewedStingException("BUG: BCF2 codec doesn't currently support padding " + - "/ depadding of genotype fields with mixed length." + - "Occurred in field " + field + " at " + vc.getChr() + ":" + vc.getStart()); - size = Math.max(size, values.size()); - } - } else { - return 1; - } + size = Math.max(size, ((List)o).size()); + } else if ( size == -1 ) + size = 1; } return size; @@ -268,20 +260,28 @@ class BCF2Writer extends IndexingVariantContextWriter { startGenotypeField(field, numInFormatField, encoding.BCF2Type); for ( final Genotype g : vc.getGenotypes() ) { - if ( ! g.hasAttribute(field) ) { - encoder.encodeRawMissingValues(numInFormatField, encoding.BCF2Type); + final Object fieldValue = g.getAttribute(field); + + if ( numInFormatField == 1 ) { + // we encode the actual allele, encodeRawValue handles the missing case where fieldValue == null + encoder.encodeRawValue(fieldValue, encoding.BCF2Type); } else { - final Object val = g.getAttribute(field); - if ( (val instanceof List) && (((List) val).size() != numInFormatField )) - throw new ReviewedStingException("BUG: header for " + field + - " has inconsistent number of values " + numInFormatField + - " compared to values in VariantContext " + ((List) val).size()); - final List vals = numInFormatField == 1 ? Collections.singletonList(val) : (List)val; - encoder.encodeRawValues(vals, encoding.BCF2Type); + // multiple values, need to handle general case + final List asList = toList(fieldValue); + final int nSampleValues = asList.size(); + for ( int i = 0; i < numInFormatField; i++ ) { + encoder.encodeRawValue(i < nSampleValues ? asList.get(i) : null, encoding.BCF2Type); + } } } } + private final static List toList(final Object o) { + if ( o == null ) return Collections.emptyList(); + else if ( o instanceof List ) return (List)o; + else return Collections.singletonList(o); + } + private final class VCFToBCFEncoding { VCFHeaderLineType vcfType; BCF2Type BCF2Type; @@ -393,22 +393,29 @@ class BCF2Writer extends IndexingVariantContextWriter { } private final void addGenotypes(final VariantContext vc) throws IOException { - if ( vc.getNAlleles() > 127 ) + if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES ) throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " + - "with > 127 alleles, but you have " + vc.getNAlleles() + " at " - + vc.getChr() + ":" + vc.getStart()); + "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have " + + vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart()); final Map alleleMap = VCFWriter.buildAlleleMap(vc); - final int requiredPloidy = 2; // TODO -- handle ploidy, will need padding / depadding - startGenotypeField(VCFConstants.GENOTYPE_KEY, requiredPloidy, BCF2Type.INT8); + final int maxPloidy = vc.getMaxPloidy(); + startGenotypeField(VCFConstants.GENOTYPE_KEY, maxPloidy, BCF2Type.INT8); for ( final Genotype g : vc.getGenotypes() ) { - if ( g.getPloidy() != requiredPloidy ) throw new ReviewedStingException("Cannot currently handle non-diploid calls!"); - final List encoding = new ArrayList(requiredPloidy); - for ( final Allele a : g.getAlleles() ) { - final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a)); - encoding.add(((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00)); + final List alleles = g.getAlleles(); + final int samplePloidy = alleles.size(); + for ( int i = 0; i < maxPloidy; i++ ) { + if ( i < samplePloidy ) { + // we encode the actual allele + final Allele a = alleles.get(i); + final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a)); + final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00); + encoder.encodePrimitive(encoded, BCF2Type.INT8); + } else { + // we need to pad with missing as we have ploidy < max for this sample + encoder.encodePrimitive(BCF2Type.INT8.getMissingBytes(), BCF2Type.INT8); + } } - encoder.encodeRawValues(encoding, BCF2Type.INT8); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index d080d1754..10eeb2778 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -257,9 +257,6 @@ class VCFWriter extends IndexingVariantContextWriter { List genotypeAttributeKeys = new ArrayList(); if ( vc.hasGenotypes() ) { genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc)); - } else if ( mHeader.hasGenotypingData() ) { - // this needs to be done in case all samples are no-calls - genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); } if ( genotypeAttributeKeys.size() > 0 ) { @@ -470,6 +467,11 @@ class VCFWriter extends IndexingVariantContextWriter { return result; } + /** + * Determine which genotype fields are in use in the genotypes in VC + * @param vc + * @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first + */ public static List calcVCFGenotypeKeys(VariantContext vc) { Set keys = new HashSet(); @@ -502,7 +504,12 @@ class VCFWriter extends IndexingVariantContextWriter { sortedList = newList; } - return sortedList; + if ( sortedList.isEmpty() && vc.hasGenotypes() ) { + // this needs to be done in case all samples are no-calls + return Collections.singletonList(VCFConstants.GENOTYPE_KEY); + } else { + return sortedList; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 81ad23a5e..196c32f19 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -46,8 +46,8 @@ import java.util.*; * @since Date created */ public class VariantContextTestProvider { - final private static boolean ENABLE_VARARRAY_TESTS = false; - final private static boolean ENABLE_PLOIDY_TESTS = false; + final private static boolean ENABLE_VARARRAY_TESTS = true; + final private static boolean ENABLE_PLOIDY_TESTS = true; final private static boolean ENABLE_PL_TESTS = true; private static VCFHeader syntheticHeader; final static List TEST_DATAs = new ArrayList(); @@ -282,6 +282,7 @@ public class VariantContextTestProvider { final Genotype homVar = new Genotype("homVar", Arrays.asList(alt1, alt1)); addGenotypeTests(site, homRef, het); addGenotypeTests(site, homRef, het, homVar); + final List noCall = new ArrayList(); // ploidy if ( ENABLE_PLOIDY_TESTS ) { @@ -292,6 +293,11 @@ public class VariantContextTestProvider { addGenotypeTests(site, new Genotype("dip", Arrays.asList(ref, alt1)), new Genotype("tet", Arrays.asList(ref, alt1, alt1))); + + addGenotypeTests(site, + new Genotype("nocall", noCall), + new Genotype("dip", Arrays.asList(ref, alt1)), + new Genotype("tet", Arrays.asList(ref, alt1, alt1))); } }