From dd6aee347a6a8e68cf71760b14529f075fdd7245 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 12 Jun 2012 15:44:12 -0400 Subject: [PATCH] Genotype encoding uses the BCF2FieldEncoder system --- .../sting/utils/codecs/bcf2/BCF2Utils.java | 40 +++ .../codecs/vcf/VCFCompoundHeaderLine.java | 3 +- .../writer/BCF2FieldEncoder.java | 89 +++++- .../writer/BCF2FieldWriter.java | 217 +++++++++++++- .../writer/BCF2FieldWriterManager.java | 128 ++++++-- .../variantcontext/writer/BCF2Writer.java | 275 +----------------- .../UnifiedGenotyperIntegrationTest.java | 66 ++--- .../VariantContextTestProvider.java | 46 +-- 8 files changed, 495 insertions(+), 369 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index 60e82b800..64e9121e9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -220,6 +220,7 @@ public final class BCF2Utils { return new File( path + ".bcf" ); } + @Ensures("BCF2Type.INTEGERS.contains(result)") public final static BCF2Type determineIntegerType(final int value) { for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) { if ( potentialType.withinRange(value) ) @@ -229,6 +230,7 @@ public final class BCF2Utils { throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value); } + @Ensures("BCF2Type.INTEGERS.contains(result)") public final static BCF2Type determineIntegerType(final int[] values) { // literally a copy of the code below, but there's no general way to unify lists and arrays in java BCF2Type maxType = BCF2Type.INT8; @@ -244,6 +246,27 @@ public final class BCF2Utils { return maxType; } + /** + * Returns the maximum BCF2 integer size of t1 and t2 + * + * For example, if t1 == INT8 and t2 == INT16 returns INT16 + * + * @param t1 + * @param t2 + * @return + */ + @Requires({"BCF2Type.INTEGERS.contains(t1)","BCF2Type.INTEGERS.contains(t2)"}) + @Ensures("BCF2Type.INTEGERS.contains(result)") + public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) { + switch ( t1 ) { + case INT8: return t2; + case INT16: return t2 == BCF2Type.INT32 ? t2 : t1; + case INT32: return t1; + default: throw new ReviewedStingException("BUG: unexpected BCF2Type " + t1); + } + } + + @Ensures("BCF2Type.INTEGERS.contains(result)") public final static BCF2Type determineIntegerType(final List values) { BCF2Type maxType = BCF2Type.INT8; for ( final int value : values ) { @@ -257,4 +280,21 @@ public final class BCF2Utils { } return maxType; } + + /** + * Helper function that takes an object and returns a list representation + * of it: + * + * o == null => [] + * o is a list => o + * else => [o] + * + * @param o + * @return + */ + public final static List toList(final Object o) { + if ( o == null ) return Collections.emptyList(); + else if ( o instanceof List ) return (List)o; + else return Collections.singletonList(o); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index d2bd507b5..608a2026a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -56,8 +56,9 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF public String getDescription() { return description; } public VCFHeaderLineType getType() { return type; } public VCFHeaderLineCount getCountType() { return countType; } + public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; } public int getCount() { - if ( countType != VCFHeaderLineCount.INTEGER ) + if ( ! isFixedCount() ) throw new ReviewedStingException("Asking for header line count when type is not an integer"); return count; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java index 78b5aaf39..74aff2d20 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.variantcontext.writer; +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; @@ -44,6 +46,11 @@ import java.util.Map; * @author Your Name * @since Date created */ +@Invariant({ + "headerLine != null", + "BCF2Type.INTEGERS.contains(dictionaryOffsetType)", + "dictionaryOffset >= 0" +}) public abstract class BCF2FieldEncoder { final VCFCompoundHeaderLine headerLine; final BCF2Type fixedType; @@ -72,16 +79,41 @@ public abstract class BCF2FieldEncoder { public boolean hasUnboundedCount() { return getCountType() == VCFHeaderLineCount.UNBOUNDED; } public boolean hasContextDeterminedCount() { return ! hasFixedCount() && ! hasUnboundedCount(); } + // + // TODO -- this class should own two clean methods + // + // Tell us whether the type and size are static, determined by from the VC itself, + // or from the actual encoded values. If the last case, provide a function that tell us + // the encoding type and size of the underlying data, given a single value. + // + + // TODO -- cleanup logic of counts + // todo -- differentiate between the VCF header declared size and the encoded size + // TODO -- for example, getUnboundedCount should be getCountFromSizeOfValue() + // + // GenotypeEncoders need to inspect the size properties of the underlying encoder + // and determine how (and whether) they need to iterate once through the data to + // determine max size (for padding) + // + @Requires("hasFixedCount()") - public int getFixedCount() { return headerLine.getCount(); } + public int getFixedCount() { + return headerLine.getCount(); + } + + public int getUnboundedCount(final Object value) { + return value instanceof List ? ((List) value).size() : 1; + } + public int getContextDeterminedCount(final VariantContext vc) { return headerLine.getCount(vc.getNAlleles() - 1); } + public int getBCFFieldCount(final VariantContext vc, final Object value) { if ( hasFixedCount() ) return getFixedCount(); else if ( hasUnboundedCount() ) - return value instanceof List ? ((List) value).size() : 1; + return getUnboundedCount(value); else return getContextDeterminedCount(vc); } @@ -107,7 +139,11 @@ public abstract class BCF2FieldEncoder { return "BCF2FieldEncoder for " + getField() + " with count " + getCountType() + " encoded with " + getClass().getSimpleName(); } - public abstract void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException; + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + encodeValue(encoder, value, type, 0); + } + + public abstract void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException; /** @@ -133,10 +169,10 @@ public abstract class BCF2FieldEncoder { } @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { if ( value != null ) { final String s = encodeString(value); - encoder.encodeString(s, s.length()); + encoder.encodeString(s, Math.max(s.length(), minValues)); } } @@ -153,7 +189,7 @@ public abstract class BCF2FieldEncoder { public static class Flag extends BCF2FieldEncoder { public Flag(final VCFCompoundHeaderLine headerLine, final BCF2Encoder encoder, final Map dict ) { super(headerLine, encoder, dict, BCF2Type.INT8); - if ( getHeaderLine().getCount() != 0 ) + if ( ! headerLine.isFixedCount() || headerLine.getCount() != 0 ) throw new ReviewedStingException("Flag encoder only suppports atomic flags!"); } @@ -163,7 +199,8 @@ public abstract class BCF2FieldEncoder { } @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + @Requires("minValues <= 1") + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { encoder.encodePrimitive(1, getFixedType()); } } @@ -174,10 +211,14 @@ public abstract class BCF2FieldEncoder { } @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { final List doubles = toList(Double.class, value); - for ( final double d : doubles ) + int count = 0; + for ( final double d : doubles ) { encoder.encodeRawFloat(d); + count++; + } + for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); } } @@ -186,15 +227,26 @@ public abstract class BCF2FieldEncoder { super(headerLine, encoder, dict, null); } + @Override + public int getUnboundedCount(final Object value) { + return value == null ? 0 : ((int[])value).length; + } + @Override public BCF2Type getDynamicType(final Object value) { return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((int[])value); } @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { - for ( final int i : (int[])value ) - encoder.encodeRawInt(i, type); + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { + int count = 0; + if ( value != null ) { + for ( final int i : (int[])value ) { + encoder.encodeRawInt(i, type); + count++; + } + } + for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); } } @@ -209,9 +261,13 @@ public abstract class BCF2FieldEncoder { } @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { - for ( final int i : toList(Integer.class, value) ) + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { + int count = 0; + for ( final int i : toList(Integer.class, value) ) { encoder.encodeRawInt(i, type); + count++; + } + for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); } } @@ -226,8 +282,9 @@ public abstract class BCF2FieldEncoder { } @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { - encoder.encodeRawInt((Integer)value, type); + @Requires("minValues <= 1") // 0 is ok as this means no values need to be encoded + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { + encoder.encodeRawInt(value == null ? type.getMissingBytes() : (Integer)value, type); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java index 3d3efdd24..d796e09c0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java @@ -26,9 +26,18 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; +import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * [Short one sentence description of this walker] @@ -58,43 +67,53 @@ import java.io.IOException; * @since Date created */ public abstract class BCF2FieldWriter { + private final VCFHeader header; private final BCF2FieldEncoder fieldEncoder; - protected BCF2FieldWriter(final BCF2FieldEncoder fieldEncoder) { + protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { + this.header = header; this.fieldEncoder = fieldEncoder; } + protected VCFHeader getHeader() { return header; } protected BCF2FieldEncoder getFieldEncoder() { return fieldEncoder; } + protected String getField() { return getFieldEncoder().getField(); } public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { encoder.encodeTyped(fieldEncoder.getDictionaryOffset(), fieldEncoder.getDictionaryOffsetType()); } - public void done(final BCF2Encoder encoder, final VariantContext vc) throws IOException { } + public void done(final BCF2Encoder encoder, final VariantContext vc) throws IOException { } // TODO -- overload done so that we null out values and test for correctness @Override public String toString() { return "BCF2FieldWriter " + getClass().getSimpleName() + " with encoder " + getFieldEncoder(); } + // -------------------------------------------------------------------------------- + // + // Sites writers + // + // -------------------------------------------------------------------------------- + public static abstract class SiteWriter extends BCF2FieldWriter { - protected SiteWriter(final BCF2FieldEncoder fieldEncoder) { - super(fieldEncoder); + protected SiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { + super(header, fieldEncoder); } public abstract void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException; } public static class GenericSiteWriter extends SiteWriter { - public GenericSiteWriter(final BCF2FieldEncoder fieldEncoder) { - super(fieldEncoder); + public GenericSiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { + super(header, fieldEncoder); } @Override public void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - final Object rawValue = vc.getAttribute(getFieldEncoder().getField(), null); + final Object rawValue = vc.getAttribute(getField(), null); final BCF2Type type = getFieldEncoder().getType(rawValue); if ( rawValue == null ) { // the value is missing, just write in null @@ -106,5 +125,189 @@ public abstract class BCF2FieldWriter { } } } + + // -------------------------------------------------------------------------------- + // + // Genotypes writers + // + // -------------------------------------------------------------------------------- + + public static abstract class GenotypesWriter extends BCF2FieldWriter { + int nValuesPerGenotype = -1; + BCF2Type encodingType = null; + + protected GenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { + super(header, fieldEncoder); + + if ( fieldEncoder.hasFixedCount() ) { + nValuesPerGenotype = getFieldEncoder().getFixedCount(); + } + } + + @Override + public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { + // writes the key information + super.start(encoder, vc); + + // only update if we need to + if ( ! getFieldEncoder().hasFixedCount() ) { + if ( getFieldEncoder().hasContextDeterminedCount() ) + // we are cheap -- just depends on genotype of allele counts + nValuesPerGenotype = getFieldEncoder().getContextDeterminedCount(vc); + else + // we have to go fishing through the values themselves (expensive) + nValuesPerGenotype = computeMaxSizeOfGenotypeFieldFromValues(vc); + } + + encoder.encodeType(nValuesPerGenotype, encodingType); + } + + public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { + final Object fieldValue = g.getAttribute(getField(), null); + getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); + } + + public Object getGenotypeValue(final Genotype g) { + return g.getAttribute(getField()); + } + + private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) { + int size = -1; + + for ( final Genotype g : vc.getGenotypes() ) { + final Object o = getGenotypeValue(g); + size = Math.max(size, getFieldEncoder().getBCFFieldCount(vc, o)); + } + + return size; + } + } + + public static class FixedTypeGenotypesWriter extends GenotypesWriter { + public FixedTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { + super(header, fieldEncoder); + + encodingType = getFieldEncoder().getFixedType(); + } + } + + public static class IntegerTypeGenotypesWriter extends GenotypesWriter { + public IntegerTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { + super(header, fieldEncoder); + } + + @Override + public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { + // the only value that is dynamic are integers + final List values = new ArrayList(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + for ( final Object i : BCF2Utils.toList(g.getAttribute(getField(), null)) ) { + values.add((Integer)i); // we know they are all integers + } + } + + encodingType = BCF2Utils.determineIntegerType(values); + super.start(encoder, vc); + } + } + + // TODO TODO TODO TODO TODO + // TODO + // TODO THIS ROUTINE NEEDS TO BE OPTIMIZED. IT ACCOUNTS FOR A SIGNIFICANT AMOUNT OF THE + // TODO RUNTIME FOR WRITING OUT BCF FILES WITH MANY GENOTYPES + // TODO + // TODO TODO TODO TODO TODO + public static class IGFGenotypesWriter extends GenotypesWriter { + final IntGenotypeFieldAccessors.Accessor ige; + + public IGFGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder, final IntGenotypeFieldAccessors.Accessor ige) { + super(header, fieldEncoder); + this.ige = ige; + + if ( ! (fieldEncoder instanceof BCF2FieldEncoder.IntArray) ) + throw new ReviewedStingException("BUG: IntGenotypesWriter requires IntArray encoder for field " + getField()); + } + + @Override + public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { + encodingType = BCF2Type.INT8; + for ( final Genotype g : vc.getGenotypes() ) { + final int[] pls = ige.getValues(g); + final BCF2Type plsType = getFieldEncoder().getType(pls); + encodingType = BCF2Utils.maxIntegerType(encodingType, plsType); + if ( encodingType == BCF2Type.INT32 ) + break; // stop early + } + + super.start(encoder, vc); + } + + @Override + public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { + getFieldEncoder().encodeValue(encoder, ige.getValues(g), encodingType, nValuesPerGenotype); + } + + @Override + public Object getGenotypeValue(final Genotype g) { + return ige.getValues(g); + } + } + + // TODO TODO TODO TODO TODO + // TODO + // TODO we should really have a fast path for encoding diploid genotypes where + // TODO we don't pay the overhead of creating the allele maps + // TODO + // TODO TODO TODO TODO TODO + public static class GTWriter extends GenotypesWriter { + Map alleleMap = null; + + public GTWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { + super(header, fieldEncoder); + } + + @Override + public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { + if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES ) + throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " + + "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have " + + vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart()); + + encodingType = BCF2Type.INT8; + alleleMap = buildAlleleMap(vc); + nValuesPerGenotype = vc.getMaxPloidy(); + super.start(encoder, vc); //To change body of overridden methods use File | Settings | File Templates. + } + + @Override + public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { + final List alleles = g.getAlleles(); + final int samplePloidy = alleles.size(); + for ( int i = 0; i < nValuesPerGenotype; i++ ) { + if ( i < samplePloidy ) { + // we encode the actual allele + final Allele a = alleles.get(i); + final int offset = alleleMap.get(a); + final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00); + encoder.encodePrimitive(encoded, encodingType); + } else { + // we need to pad with missing as we have ploidy < max for this sample + encoder.encodePrimitive(encodingType.getMissingBytes(), encodingType); + } + } + } + + private final static Map buildAlleleMap(final VariantContext vc) { + final Map alleleMap = new HashMap(vc.getAlleles().size()+1); + alleleMap.put(Allele.NO_CALL, -1); // convenience for lookup + + final List alleles = vc.getAlleles(); + for ( int i = 0; i < alleles.size(); i++ ) { + alleleMap.put(alleles.get(i), i); + } + + return alleleMap; + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java index 47764a681..4b527fdca 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java @@ -26,10 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.HashMap; @@ -65,6 +62,8 @@ import java.util.Map; public class BCF2FieldWriterManager { final protected static Logger logger = Logger.getLogger(BCF2FieldWriterManager.class); final Map siteWriters = new HashMap(); + final Map genotypesWriters = new HashMap(); + final IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors(); public BCF2FieldWriterManager() { } @@ -72,41 +71,108 @@ public class BCF2FieldWriterManager { for (final VCFHeaderLine line : header.getMetaData()) { if ( line instanceof VCFInfoHeaderLine ) { final String field = ((VCFInfoHeaderLine) line).getID(); - final BCF2FieldWriter.SiteWriter writer = createInfoWriter((VCFInfoHeaderLine)line, encoder, dictionary); - logger.info("Installing for field " + field + " field writer " + writer); + final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, (VCFInfoHeaderLine)line, encoder, dictionary); + log(field, writer); siteWriters.put(field, writer); + } else if ( line instanceof VCFFormatHeaderLine ) { + final String field = ((VCFFormatHeaderLine) line).getID(); + final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, (VCFFormatHeaderLine)line, encoder, dictionary); + log(field, writer); + genotypesWriters.put(field, writer); } } } - private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFInfoHeaderLine line, final BCF2Encoder encoder, final Map dict) { - BCF2FieldEncoder fieldEncoder = null; - switch ( line.getType() ) { - case Character: - case String: - fieldEncoder = new BCF2FieldEncoder.StringOrCharacter(line, encoder, dict); - break; - case Flag: - fieldEncoder = new BCF2FieldEncoder.Flag(line, encoder, dict); - break; - case Float: - fieldEncoder = new BCF2FieldEncoder.Float(line, encoder, dict); - break; - case Integer: - if ( line.getCountType() == VCFHeaderLineCount.INTEGER && line.getCount() == 1 ) - fieldEncoder = new BCF2FieldEncoder.AtomicInt(line, encoder, dict); - else - fieldEncoder = new BCF2FieldEncoder.IntList(line, encoder, dict); - break; - default: - throw new ReviewedStingException("Unexpected type for field " + line.getID()); - } - - return new BCF2FieldWriter.GenericSiteWriter(fieldEncoder); + private final void log(final String field, final BCF2FieldWriter writer) { + logger.info("Using writer " + writer); } + // ----------------------------------------------------------------- + // + // Master routine to look at the header, a specific line, and + // build an appropriate SiteWriter for that header element + // + // ----------------------------------------------------------------- + + private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFHeader header, + final VCFInfoHeaderLine line, + final BCF2Encoder encoder, + final Map dict) { + return new BCF2FieldWriter.GenericSiteWriter(header, createFieldEncoder(line, encoder, dict, false)); + } + + private BCF2FieldEncoder createFieldEncoder(final VCFCompoundHeaderLine line, + final BCF2Encoder encoder, + final Map dict, + final boolean createGenotypesEncoders ) { + + if ( createGenotypesEncoders && intGenotypeFieldAccessors.getAccessor(line.getID()) != null ) { + if ( line.getType() != VCFHeaderLineType.Integer ) + logger.warn("Warning: field " + line.getID() + " expected to encode an integer but saw " + line.getType() + " for record " + line); + return new BCF2FieldEncoder.IntArray(line, encoder, dict); + } else if ( createGenotypesEncoders && line.getID().equals(VCFConstants.GENOTYPE_KEY) ) { + return new BCF2FieldEncoder.IntList(line, encoder, dict); + } else { + switch ( line.getType() ) { + case Character: + case String: + return new BCF2FieldEncoder.StringOrCharacter(line, encoder, dict); + case Flag: + return new BCF2FieldEncoder.Flag(line, encoder, dict); + case Float: + return new BCF2FieldEncoder.Float(line, encoder, dict); + case Integer: + if ( line.getCountType() == VCFHeaderLineCount.INTEGER && line.getCount() == 1 ) + return new BCF2FieldEncoder.AtomicInt(line, encoder, dict); + else + return new BCF2FieldEncoder.IntList(line, encoder, dict); + default: + throw new ReviewedStingException("Unexpected type for field " + line.getID()); + } + } + } + + // ----------------------------------------------------------------- + // + // Master routine to look at the header, a specific line, and + // build an appropriate Genotypes for that header element + // + // ----------------------------------------------------------------- + + private BCF2FieldWriter.GenotypesWriter createGenotypesWriter(final VCFHeader header, + final VCFFormatHeaderLine line, + final BCF2Encoder encoder, + final Map dict) { + final String field = line.getID(); + final BCF2FieldEncoder fieldEncoder = createFieldEncoder(line, encoder, dict, true); + + if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { + return new BCF2FieldWriter.GTWriter(header, fieldEncoder); + } else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) { + return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, intGenotypeFieldAccessors.getAccessor(field)); + } else if ( line.getType() == VCFHeaderLineType.Integer ) { + return new BCF2FieldWriter.IntegerTypeGenotypesWriter(header, fieldEncoder); + } else { + return new BCF2FieldWriter.FixedTypeGenotypesWriter(header, fieldEncoder); + } + } + + // ----------------------------------------------------------------- + // + // Accessors to get site / genotype writers + // + // ----------------------------------------------------------------- + public BCF2FieldWriter.SiteWriter getSiteFieldWriter(final String key) { - final BCF2FieldWriter.SiteWriter writer = siteWriters.get(key); + return getWriter(key, siteWriters); + } + + public BCF2FieldWriter.GenotypesWriter getGenotypeFieldWriter(final String key) { + return getWriter(key, genotypesWriters); + } + + public T getWriter(final String key, final Map map) { + final T writer = map.get(key); if ( writer == null ) throw new ReviewedStingException("BUG: no writer found for " + key); return writer; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 2c9f08fec..2c44c0fc8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -50,7 +50,6 @@ class BCF2Writer extends IndexingVariantContextWriter { private final boolean doNotWriteGenotypes; private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives - IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors(); final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager(); public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { @@ -233,251 +232,32 @@ class BCF2Writer extends IndexingVariantContextWriter { writer.start(encoder, vc); writer.site(encoder, vc); writer.done(encoder, vc); - - // the old way of doing things -// final VCFToBCFEncoding encoding = prepFieldValueForEncoding(key, infoFieldEntry.getValue()); -// encodeStringByRef(key); -// encoder.encodeTyped(encoding.valuesToEncode, encoding.BCF2Type); } } - // -------------------------------------------------------------------------------- - // - // Type matching routines between VCF and BCF - // - // -------------------------------------------------------------------------------- - - - private final class VCFToBCFEncoding { - VCFHeaderLineType vcfType; - BCF2Type BCF2Type; - List valuesToEncode; - - private VCFToBCFEncoding(final VCFHeaderLineType vcfType, final BCF2Type BCF2Type, final List valuesToEncode) { - this.vcfType = vcfType; - this.BCF2Type = BCF2Type; - this.valuesToEncode = valuesToEncode; - } - } - - // TODO TODO TODO TODO TODO - // TODO - // TODO -- we really need explicit converters as first class objects - // TODO -- need to generalize so we can enable vectors of compressed genotype ints - // TODO -- no sense in allocating these over and over - // TODO - // TODO TODO TODO TODO TODO - private final VCFToBCFEncoding prepFieldValueForEncoding(final String field, final Object value) { - final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, field); - final boolean isList = value instanceof List; - final Object toType = isList ? ((List)value).get(0) : value; - - try { - switch ( metaData.getType() ) { - case Character: - assert toType instanceof String; - return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(value)); - case Flag: - return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1)); - case String: - final String s = isList ? BCF2Utils.collapseStringList((List)value) : (String)value; - return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(s)); - case Integer: // note integer calculation is a bit complex because of the need to determine sizes - List l; - BCF2Type intType; - if ( isList ) { - l = (List)value; - intType = BCF2Utils.determineIntegerType(l); - } else if ( value != null ) { - intType = BCF2Utils.determineIntegerType((Integer) value); - l = Collections.singletonList((Integer)value); - } else { - intType = BCF2Type.INT8; - l = Collections.singletonList((Integer) null); - } - return new VCFToBCFEncoding(metaData.getType(), intType, l); - case Float: - return new VCFToBCFEncoding(metaData.getType(), BCF2Type.FLOAT, isList ? (List)value : Collections.singletonList(value)); - default: - throw new ReviewedStingException("Unexpected type for field " + field); - } - } catch ( ClassCastException e ) { - throw new ReviewedStingException("Error computing VCF -> BCF encoding. Received cast class exception" - + " indicating that the VCF header for " + metaData + " is inconsistent with the" + - " value seen in the VariantContext object = " + value, e); - } - } - - // -------------------------------------------------------------------------------- - // - // Genotype field encoding - // - // -------------------------------------------------------------------------------- - private byte[] buildSamplesData(final VariantContext vc) throws IOException { - BCF2Codec.LazyData lazyData = getLazyData(vc); + final BCF2Codec.LazyData lazyData = getLazyData(vc); if ( lazyData != null ) { + // we never decoded any data from this BCF file, so just pass it back return lazyData.bytes; } else { // we have to do work to convert the VC into a BCF2 byte stream - List genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header); + final List genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header); for ( final String field : genotypeFields ) { - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { - addGenotypes(vc); - } else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) { - addIntGenotypeField(field, intGenotypeFieldAccessors.getAccessor(field), vc); - } else { - addGenericGenotypeField(vc, field); + final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); + + writer.start(encoder, vc); + for ( final String name : header.getGenotypeSamples() ) { + // todo -- can we optimize this get (string -> genotype) which can be expensive + final Genotype g = vc.getGenotype(name); + writer.addGenotype(encoder, vc, g); } + writer.done(encoder, vc); } return encoder.getRecordBytes(); } } - private final int getNGenotypeFieldValues(final String field, final VariantContext vc, final IntGenotypeFieldAccessors.Accessor ige) { - final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, field); - assert metaData != null; // field is supposed to be in header - - int nFields = metaData.getCount(vc.getNAlleles() - 1); - if ( nFields == -1 ) { // unbounded, need to look at values - return computeMaxSizeOfGenotypeFieldFromValues(field, vc, ige); - } else { - return nFields; - } - } - - private final int computeMaxSizeOfGenotypeFieldFromValues(final String field, final VariantContext vc, final IntGenotypeFieldAccessors.Accessor ige) { - int size = -1; - final GenotypesContext gc = vc.getGenotypes(); - - if ( ige == null ) { - for ( final Genotype g : gc ) { - final Object o = g.getAttribute(field); - if ( o == null ) continue; - if ( o instanceof List ) { - // only do compute if first value is of type list - size = Math.max(size, ((List)o).size()); - } else if ( size == -1 ) - size = 1; - } - } else { - for ( final Genotype g : gc ) { - size = Math.max(size, ige.getSize(g)); - } - } - - return size; - } - - private final void addGenericGenotypeField(final VariantContext vc, final String field) throws IOException { - final int numInFormatField = getNGenotypeFieldValues(field, vc, null); - final VCFToBCFEncoding encoding = prepFieldValueForEncoding(field, null); - - startGenotypeField(field, numInFormatField, encoding.BCF2Type); - for ( final String name : header.getGenotypeSamples() ) { - final Genotype g = vc.getGenotype(name); // todo -- can we optimize this? - try { - final Object fieldValue = g.getAttribute(field); - - if ( numInFormatField == 1 ) { - // we encode the actual allele, encodeRawValue handles the missing case where fieldValue == null - encoder.encodeRawValue(fieldValue, encoding.BCF2Type); - } else if ( encoding.vcfType == VCFHeaderLineType.String ) { - // we have multiple strings, so we need to collapse them - throw new ReviewedStingException("LIMITATION: Cannot encode arrays of string values in genotypes"); - } else { - // multiple values, need to handle general case - final List asList = toList(fieldValue); - final int nSampleValues = asList.size(); - for ( int i = 0; i < numInFormatField; i++ ) { - encoder.encodeRawValue(i < nSampleValues ? asList.get(i) : null, encoding.BCF2Type); - } - } - } catch ( ClassCastException e ) { - throw new ReviewedStingException("Value stored in VariantContext incompatible with VCF header type for field " + field, e); - } - } - } - - // TODO TODO TODO TODO TODO - // TODO - // TODO THIS ROUTINE NEEDS TO BE OPTIMIZED. IT ACCOUNTS FOR A SIGNIFICANT AMOUNT OF THE - // TODO RUNTIME FOR WRITING OUT BCF FILES WITH MANY GENOTYPES - // TODO - // TODO TODO TODO TODO TODO - private final void addIntGenotypeField(final String field, - final IntGenotypeFieldAccessors.Accessor ige, - final VariantContext vc) throws IOException { - final int numPLs = getNGenotypeFieldValues(field, vc, ige); - final int[] allPLs = new int[numPLs * vc.getNSamples()]; - - // collect all of the PLs into a single vector of values - int i = 0; - for ( final String name : header.getGenotypeSamples() ) { - final Genotype g = vc.getGenotype(name); // todo -- can we optimize this? - final int[] pls = ige.getValues(g); - - if ( pls == null ) - for ( int j = 0; j < numPLs; j++) allPLs[i++] = -1; - else { - assert pls.length == numPLs; - for ( int pl : pls ) allPLs[i++] = pl; - } - } - - // determine the best size - final BCF2Type type = BCF2Utils.determineIntegerType(allPLs); - startGenotypeField(field, numPLs, type); - for ( int pl : allPLs ) - encoder.encodePrimitive(pl == -1 ? type.getMissingBytes() : pl, type); - } - - // TODO TODO TODO TODO TODO - // TODO - // TODO we should really have a fast path for encoding diploid genotypes where - // TODO we don't pay the overhead of creating the allele maps - // TODO - // TODO TODO TODO TODO TODO - private final void addGenotypes(final VariantContext vc) throws IOException { - if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES ) - throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " + - "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have " - + vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart()); - - final Map alleleMap = buildAlleleMap(vc); - final int maxPloidy = vc.getMaxPloidy(); - startGenotypeField(VCFConstants.GENOTYPE_KEY, maxPloidy, BCF2Type.INT8); - for ( final String name : header.getGenotypeSamples() ) { - final Genotype g = vc.getGenotype(name); // todo -- can we optimize this? - final List alleles = g.getAlleles(); - final int samplePloidy = alleles.size(); - for ( int i = 0; i < maxPloidy; i++ ) { - if ( i < samplePloidy ) { - // we encode the actual allele - final Allele a = alleles.get(i); - final int offset = alleleMap.get(a); - final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00); - encoder.encodePrimitive(encoded, BCF2Type.INT8); - } else { - // we need to pad with missing as we have ploidy < max for this sample - encoder.encodePrimitive(BCF2Type.INT8.getMissingBytes(), BCF2Type.INT8); - } - } - } - } - - private final static Map buildAlleleMap(final VariantContext vc) { - final Map alleleMap = new HashMap(vc.getAlleles().size()+1); - alleleMap.put(Allele.NO_CALL, -1); // convenience for lookup - - final List alleles = vc.getAlleles(); - for ( int i = 0; i < alleles.size(); i++ ) { - alleleMap.put(alleles.get(i), i); - } - - return alleleMap; - } - // -------------------------------------------------------------------------------- // // Low-level block encoding @@ -499,16 +279,6 @@ class BCF2Writer extends IndexingVariantContextWriter { outputStream.write(genotypesBlock); } - @Requires("string != null") - @Ensures("BCF2Type.INTEGERS.contains(result)") - private final BCF2Type encodeStringByRef(final String string) throws IOException { - final Integer offset = stringDictionaryMap.get(string); - if ( offset == null ) throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF"); - final BCF2Type type = BCF2Utils.determineIntegerType(offset); - encoder.encodeTyped(offset, type); - return type; - } - @Requires("! strings.isEmpty()") @Ensures("BCF2Type.INTEGERS.contains(result)") private final BCF2Type encodeStringsByRef(final Collection strings) throws IOException { @@ -540,29 +310,6 @@ class BCF2Writer extends IndexingVariantContextWriter { return maxType; } - @Requires({"key != null", "! key.equals(\"\")", "size >= 0"}) - private final void startGenotypeField(final String key, final int size, final BCF2Type valueType) throws IOException { - encodeStringByRef(key); - encoder.encodeType(size, valueType); - } - - /** - * Helper function that takes an object and returns a list representation - * of it: - * - * o == null => [] - * o is a list => o - * else => [o] - * - * @param o - * @return - */ - private final static List toList(final Object o) { - if ( o == null ) return Collections.emptyList(); - else if ( o instanceof List ) return (List)o; - else return Collections.singletonList(o); - } - /** * Create the contigDictionary from the contigLines extracted from the VCF header * diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index da2aa9003..48f57c01f 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -29,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("bf5c76bec6e00199d441b6175b6b7c39")); + Arrays.asList("40dae29480e26efb117344e0e82e997c")); executeTest("test MultiSample Pilot1", spec); } @@ -37,7 +37,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("9f56f8d62c047213c894c3f250706aea")); + Arrays.asList("96b9f6005d9437bf527344cf98319739")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -45,7 +45,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("0a5048062cd9022b761ae87efed5957e")); + Arrays.asList("02c5c466ad26104b64166beb788a618c")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -53,7 +53,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("f50a30bf9bbd4e5dcd5d7d9282b6dadf")); + Arrays.asList("7cc7ec249243a0dcb9e3a3033d4704e4")); executeTest("test SingleSample Pilot2", spec); } @@ -61,7 +61,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + testDir + "multiallelic.snps.bam -o %s -L " + testDir + "multiallelic.snps.intervals", 1, - Arrays.asList("6fb6ea5f2b9da02a0fea7cb2994fb5db")); + Arrays.asList("c0416efb6ff481889542e7807baa495c")); executeTest("test Multiple SNP alleles", spec); } @@ -69,7 +69,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testBadRead() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH -I " + testDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, - Arrays.asList("95158fb50db5d41a678cd331a3ffe5e1")); + Arrays.asList("e2cf97bca4a720ca64ca7f682da6c9f0")); executeTest("test bad read", spec); } @@ -77,7 +77,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("c86e05f315a86bc190d72cde911e6fe2")); + Arrays.asList("976694d7aedd0272a3e2ba46e00581f5")); executeTest("test reverse trim", spec); } @@ -87,7 +87,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "f6d655714706b6e8390037db3fad60ef"; + private final static String COMPRESSED_OUTPUT_MD5 = "c0f3d88386367fcd66d196b49a22bfaa"; @Test public void testCompressedOutput() { @@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "7bc812cc553b4ab77c08049f0e32d0f6"; + String md5 = "34cb7146c037925e8f324cffd986834d"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -140,7 +140,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("dfeaccb68165fdaffafde9150914432d")); + Arrays.asList("ac4c8e0fdaf94a7949ceb157d3c836a9")); executeTest("test min_base_quality_score 26", spec); } @@ -148,7 +148,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("35ef19b4f248969c74da8bd7489385d6")); + Arrays.asList("751cfe684b0ed19b0d509bb7a315d65d")); executeTest("test SLOD", spec); } @@ -156,7 +156,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("aa49989fde8c6378f5c751f8b267c471")); + Arrays.asList("cbf1b8b261f726abe43045806e17e841")); executeTest("test NDA", spec); } @@ -164,23 +164,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("ffaeb60a5776d85b41c64786ddc4d14d")); + Arrays.asList("70fd3b551f9def332393b7f3976440ac")); executeTest("test using comp track", spec); } @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "f9a4005c53291170800e6023503d5635"); + testOutputParameters("-sites_only", "9700208609431548b2651dc582f1ea10"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "e6c63baff51aaeb318c8bebaf2989828"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "b7bfc2a5298f86e3aef1bb70bebfb38e"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "43ffa34646d781a368ea81342c21ae2e"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "90dcce8cd6de92258f332b261e64a5e9"); } private void testOutputParameters(final String args, final String md5) { @@ -194,7 +194,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("c7cb29121eb30e752ab6652a6d2a62a6")); + Arrays.asList("c5311739d1bba16cd59fb33e5f92ff49")); executeTest("test confidence 1", spec1); } @@ -202,7 +202,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("e7bdb76be82420a03ff28038d283822d")); + Arrays.asList("189f5b7026595a0814e1eb1466834760")); executeTest("test confidence 2", spec2); } @@ -213,12 +213,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "ca65e199e9ff0bc986df3dee74e11eb1" ); + testHeterozosity( 0.01, "a818570e33316086ad5cb95992faca69" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "ddcdfe4a5252da59278a6f1ba6f8a175" ); + testHeterozosity( 1.0 / 1850, "3bfdba9bcc9c84d326750ad4a879897f" ); } private void testHeterozosity(final double arg, final String md5) { @@ -242,7 +242,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("c4b3876d76e3d0fb78a1d3ebd674f1a1")); + Arrays.asList("1bd18653e8d9bf72c180142c2287eb5e")); executeTest(String.format("test multiple technologies"), spec); } @@ -261,7 +261,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("41445b1cd1a82af71126ff1692f7a5fe")); + Arrays.asList("1defb30f6b870fbe5bcb27aed22ffb04")); executeTest(String.format("test calling with BAQ"), spec); } @@ -280,7 +280,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("c9e79470a4ce6eacde366e9fcf4d5b14")); + Arrays.asList("2d52f5243da578c4070b03881852aae3")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -295,7 +295,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("70f8a17ba68131520db5c764ac5acdd2")); + Arrays.asList("9271105e630ab39cf1c88b338da54594")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -308,7 +308,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("e4316d80fd833886820c8b4e122fbfc4")); + Arrays.asList("80f0221c47c30a03cb902b1c54c1eb12")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -318,7 +318,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("c92aba3635f3331ddf8ae7a0382ca594")); + Arrays.asList("ce40ff952025e806638156c3cd002927")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -328,7 +328,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("b87034f349887160ec1124e12863d543")); + Arrays.asList("4738fa9e68a010e05c64ae965ed94de3")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -336,13 +336,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("51e6a7868d2ea2daefa411ed82f18be2")); + Arrays.asList("0b0114364680a23e125f469208b96c6f")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("954c52be0c6ca9ed5a213a53f4efbc10")); + Arrays.asList("adcf53b8dcfde7f2c657745751549bfe")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -352,7 +352,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + testDir + vcf + " -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("ae44230ed54fd8ce63711cae908470cb")); + Arrays.asList("3e3ac23846801c34acbf10a1a527264a")); executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); } @@ -385,7 +385,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("471012c1d3dbec4633710264de5daa24")); + Arrays.asList("eb9d8d1b19617190762f62bad139263a")); executeTest("test minIndelFraction 0.0", spec); } @@ -393,7 +393,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("9165507fb202d515512a947a8a9db6bb")); + Arrays.asList("952f46506d3ef9c11cafcc072a00750a")); executeTest("test minIndelFraction 0.25", spec); } @@ -401,7 +401,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction100() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 1", 1, - Arrays.asList("c1bbd4998b7c6dffee1682d3e5c929cc")); + Arrays.asList("50a6774b7d8f71fe0e125c204d50ba84")); executeTest("test minIndelFraction 1.0", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 620975b78..ddaef78ab 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -156,20 +156,32 @@ public class VariantContextTestProvider { } } + private final static void addHeaderLine(final Set metaData, final String id, final int count, final VCFHeaderLineType type) { + metaData.add(new VCFInfoHeaderLine(id, count, type, "x")); + if ( type != VCFHeaderLineType.Flag ) + metaData.add(new VCFFormatHeaderLine(id, count, type, "x")); + } + + private final static void addHeaderLine(final Set metaData, final String id, final VCFHeaderLineCount count, final VCFHeaderLineType type) { + metaData.add(new VCFInfoHeaderLine(id, count, type, "x")); + if ( type != VCFHeaderLineType.Flag ) + metaData.add(new VCFFormatHeaderLine(id, count, type, "x")); + } + private static void createSyntheticHeader() { Set metaData = new TreeSet(); - metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x")); - metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x")); - metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x")); - metaData.add(new VCFInfoHeaderLine("VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "x")); + addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String); + addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String); + addHeaderLine(metaData, "STRING20", 20, VCFHeaderLineType.String); + addHeaderLine(metaData, "VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); - metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")); - metaData.add(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")); - metaData.add(new VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); - metaData.add(new VCFFormatHeaderLine("GS", 2, VCFHeaderLineType.String, "A doubleton list of strings")); - metaData.add(new VCFFormatHeaderLine("GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "A variable list of strings")); - metaData.add(new VCFFormatHeaderLine("FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype filters")); + addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); + addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); + addHeaderLine(metaData, "FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); // prep the header metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0)); @@ -177,13 +189,13 @@ public class VariantContextTestProvider { metaData.add(new VCFFilterHeaderLine("FILTER1")); metaData.add(new VCFFilterHeaderLine("FILTER2")); - metaData.add(new VCFInfoHeaderLine("INT1", 1, VCFHeaderLineType.Integer, "x")); - metaData.add(new VCFInfoHeaderLine("INT3", 3, VCFHeaderLineType.Integer, "x")); - metaData.add(new VCFInfoHeaderLine("INT20", 20, VCFHeaderLineType.Integer, "x")); - metaData.add(new VCFInfoHeaderLine("INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - metaData.add(new VCFInfoHeaderLine("FLOAT1", 1, VCFHeaderLineType.Float, "x")); - metaData.add(new VCFInfoHeaderLine("FLOAT3", 3, VCFHeaderLineType.Float, "x")); - metaData.add(new VCFInfoHeaderLine("FLAG", 0, VCFHeaderLineType.Flag, "x")); + addHeaderLine(metaData, "INT1", 1, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "INT3", 3, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "INT20", 20, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "FLOAT1", 1, VCFHeaderLineType.Float); + addHeaderLine(metaData, "FLOAT3", 3, VCFHeaderLineType.Float); + addHeaderLine(metaData, "FLAG", 0, VCFHeaderLineType.Flag); syntheticHeader = new VCFHeader(metaData); }