From 2a86b81a3f081912274a7510a122c77ef9162a2d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 10 Jun 2012 10:53:51 -0400 Subject: [PATCH] Initial version of clean, fast formatting routines built dynamically from a VCF header -- BCF2FieldEncoder and writers divide up the task of formatting values (atomic or vector, ints, strings, floats, etc.) from the task of writing these out at the sites or genotypes level. -- Allows us to create efficient encoders for specific combinations of header fields, such as int[] encoded values with exactly 3 values -- Currently only used for INFO fields, but a subsequent commit will include optimized genotype field encoder -- Allowed us to naturally support encoding of lists of strings -- Bugfixes in VariantContextUtils introduced in genotype -> genotypebuilder conversion -- Fixes for integration test failures -- Enabling contig updates -- WalkerTest now prints out relative paths where possible to make cut/paste/run easier --- .../io/stubs/VariantContextWriterStub.java | 2 +- .../beagle/BeagleOutputToVCFWalker.java | 2 +- .../walkers/variantutils/SelectVariants.java | 2 +- .../utils/variantcontext/GenotypeBuilder.java | 10 + .../variantcontext/VariantContextUtils.java | 7 +- .../writer/BCF2FieldEncoder.java | 233 ++++++++++++++++++ .../writer/BCF2FieldWriter.java | 110 +++++++++ .../writer/BCF2FieldWriterManager.java | 113 +++++++++ .../variantcontext/writer/BCF2Writer.java | 25 +- .../org/broadinstitute/sting/BaseTest.java | 4 +- .../org/broadinstitute/sting/WalkerTest.java | 4 +- .../VariantContextTestProvider.java | 90 ++++--- 12 files changed, 546 insertions(+), 56 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java index 0a2e2af17..7b5cff321 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java @@ -51,7 +51,7 @@ import java.util.List; * @version 0.1 */ public class VariantContextWriterStub implements Stub, VariantContextWriter { - public final static boolean UPDATE_CONTIG_HEADERS = false; + public final static boolean UPDATE_CONTIG_HEADERS = true; /** * The engine, central to the GATK's processing. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index f9ef52857..31006f4d8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -326,7 +326,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { else { originalAttributes.put("OG","."); } - Genotype imputedGenotype = new GenotypeBuilder(g.getSampleName(), alleles).log10PError(genotypeQuality).attributes(originalAttributes).phased(genotypeIsPhased).make(); + Genotype imputedGenotype = new GenotypeBuilder(g).alleles(alleles).log10PError(genotypeQuality).attributes(originalAttributes).phased(genotypeIsPhased).make(); if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) { beagleVarCounts++; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index d37d1f895..b0f4c569a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -730,7 +730,7 @@ public class SelectVariants extends RodWalker implements TreeR //Set genotype to no call if it falls in the fraction. if(fractionGenotypes>0 && randomGenotypes.nextDouble() alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).GQ(-1).make()); + genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make()); } else{ genotypes.add(genotype); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index 713dc219b..37e7d5d58 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -346,6 +346,16 @@ public final class GenotypeBuilder { return this; } + /** + * Tells this builder to remove all extended attributes + * + * @return + */ + public GenotypeBuilder noAttributes() { + this.extendedAttributes = null; + return this; + } + /** * This genotype has this attribute key / value pair. 
* diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 2a4b251bf..a51f2189d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -462,9 +462,10 @@ public class VariantContextUtils { // Genotypes final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype g : vc.getGenotypes() ) { - // TODO -- fixme - //Map genotypeAttributes = subsetAttributes(g.commonInfo, keysToPreserve); - //genotypes.add(new GenotypeBuilder(g).attributes(genotypeAttributes).make()); + final GenotypeBuilder gb = new GenotypeBuilder(g); + // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ + gb.noAD().noDP().noPL().noAttributes(); + genotypes.add(gb.make()); } return builder.genotypes(genotypes).attributes(attributes); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java new file mode 100644 index 000000000..78b5aaf39 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice 
shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext.writer; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder; +import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; +import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCompoundHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +/** + * + * + * @author Your Name + * @since Date created + */ +public abstract class BCF2FieldEncoder { + final VCFCompoundHeaderLine headerLine; + final BCF2Type fixedType; + final int dictionaryOffset; + final BCF2Type dictionaryOffsetType; + + public BCF2FieldEncoder(final VCFCompoundHeaderLine headerLine, final BCF2Encoder encoder, final Map dict, final BCF2Type fixedType) { + this.headerLine = headerLine; + this.fixedType = fixedType; + + final Integer offset = dict.get(getField()); + if ( offset == null ) throw new ReviewedStingException("Format error: could not find string " + getField() + " in header as required by BCF"); + this.dictionaryOffset = offset; + dictionaryOffsetType = 
BCF2Utils.determineIntegerType(offset); + } + + public VCFHeaderLineCount getCountType() { + return headerLine.getCountType(); + } + + public VCFCompoundHeaderLine getHeaderLine() { + return headerLine; + } + + public boolean hasFixedCount() { return getCountType() == VCFHeaderLineCount.INTEGER; } + public boolean hasUnboundedCount() { return getCountType() == VCFHeaderLineCount.UNBOUNDED; } + public boolean hasContextDeterminedCount() { return ! hasFixedCount() && ! hasUnboundedCount(); } + + @Requires("hasFixedCount()") + public int getFixedCount() { return headerLine.getCount(); } + public int getContextDeterminedCount(final VariantContext vc) { + return headerLine.getCount(vc.getNAlleles() - 1); + } + public int getBCFFieldCount(final VariantContext vc, final Object value) { + if ( hasFixedCount() ) + return getFixedCount(); + else if ( hasUnboundedCount() ) + return value instanceof List ? ((List) value).size() : 1; + else + return getContextDeterminedCount(vc); + } + + public String getField() { return headerLine.getID(); } + + public int getDictionaryOffset() { return dictionaryOffset; } + public BCF2Type getDictionaryOffsetType() { return dictionaryOffsetType; } + + public boolean isFixedTyped() { return ! isDynamicallyTyped(); } + public boolean isDynamicallyTyped() { return fixedType == null; } + public BCF2Type getType(final Object value) { return isDynamicallyTyped() ? 
getDynamicType(value) : getFixedType(); } + public BCF2Type getFixedType() { + if ( fixedType != null ) + return fixedType; + else + throw new ReviewedStingException("Not a fixed type encoder: " + getField()); + } + public BCF2Type getDynamicType(final Object value) { throw new ReviewedStingException("Function getDynamicType() not implemented"); } + + @Override + public String toString() { + return "BCF2FieldEncoder for " + getField() + " with count " + getCountType() + " encoded with " + getClass().getSimpleName(); + } + + public abstract void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException; + + + /** + * Helper function that takes an object and returns a list representation + * of it: + * + * o == null => [] + * o is a list => o + * else => [o] + * + * @param o + * @return + */ + private final static List toList(final Class c, final Object o) { + if ( o == null ) return Collections.emptyList(); + else if ( o instanceof List ) return (List)o; + else return Collections.singletonList((T)o); + } + + public static class StringOrCharacter extends BCF2FieldEncoder { + public StringOrCharacter(final VCFCompoundHeaderLine headerLine, final BCF2Encoder encoder, final Map dict ) { + super(headerLine, encoder, dict, BCF2Type.CHAR); + } + + @Override + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + if ( value != null ) { + final String s = encodeString(value); + encoder.encodeString(s, s.length()); + } + } + + @Override + public int getBCFFieldCount(final VariantContext vc, final Object value) { + return value == null ? 0 : encodeString(value).length(); + } + + private String encodeString(final Object value) { + return value instanceof List ? 
BCF2Utils.collapseStringList((List)value) : (String)value; + } + } + + public static class Flag extends BCF2FieldEncoder { + public Flag(final VCFCompoundHeaderLine headerLine, final BCF2Encoder encoder, final Map dict ) { + super(headerLine, encoder, dict, BCF2Type.INT8); + if ( getHeaderLine().getCount() != 0 ) + throw new ReviewedStingException("Flag encoder only suppports atomic flags!"); + } + + @Override + public int getFixedCount() { + return 1; // the header says 0 but we will write 1 value + } + + @Override + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + encoder.encodePrimitive(1, getFixedType()); + } + } + + public static class Float extends BCF2FieldEncoder { + public Float(final VCFCompoundHeaderLine headerLine, final BCF2Encoder encoder, final Map dict ) { + super(headerLine, encoder, dict, BCF2Type.FLOAT); + } + + @Override + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + final List doubles = toList(Double.class, value); + for ( final double d : doubles ) + encoder.encodeRawFloat(d); + } + } + + public static class IntArray extends BCF2FieldEncoder { + public IntArray(final VCFCompoundHeaderLine headerLine, final BCF2Encoder encoder, final Map dict ) { + super(headerLine, encoder, dict, null); + } + + @Override + public BCF2Type getDynamicType(final Object value) { + return value == null ? 
BCF2Type.INT8 : BCF2Utils.determineIntegerType((int[])value); + } + + @Override + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + for ( final int i : (int[])value ) + encoder.encodeRawInt(i, type); + } + } + + public static class IntList extends BCF2FieldEncoder { + public IntList(final VCFCompoundHeaderLine headerLine, final BCF2Encoder encoder, final Map dict ) { + super(headerLine, encoder, dict, null); + } + + @Override + public BCF2Type getDynamicType(final Object value) { + return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType(toList(Integer.class, value)); + } + + @Override + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + for ( final int i : toList(Integer.class, value) ) + encoder.encodeRawInt(i, type); + } + } + + public static class AtomicInt extends BCF2FieldEncoder { + public AtomicInt(final VCFCompoundHeaderLine headerLine, final BCF2Encoder encoder, final Map dict ) { + super(headerLine, encoder, dict, null); + } + + @Override + public BCF2Type getDynamicType(final Object value) { + return value == null ? 
BCF2Type.INT8 : BCF2Utils.determineIntegerType((Integer)value); + } + + @Override + public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException { + encoder.encodeRawInt((Integer)value, type); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java new file mode 100644 index 000000000..3d3efdd24 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.variantcontext.writer; + +import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder; +import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.IOException; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public abstract class BCF2FieldWriter { + private final BCF2FieldEncoder fieldEncoder; + + protected BCF2FieldWriter(final BCF2FieldEncoder fieldEncoder) { + this.fieldEncoder = fieldEncoder; + } + + protected BCF2FieldEncoder getFieldEncoder() { + return fieldEncoder; + } + + public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { + encoder.encodeTyped(fieldEncoder.getDictionaryOffset(), fieldEncoder.getDictionaryOffsetType()); + } + + public void done(final BCF2Encoder encoder, final VariantContext vc) throws IOException { } + + @Override + public String toString() { + return "BCF2FieldWriter " + getClass().getSimpleName() + " with encoder " + getFieldEncoder(); + } + + public static abstract class SiteWriter extends BCF2FieldWriter { + protected SiteWriter(final BCF2FieldEncoder fieldEncoder) { + super(fieldEncoder); + } + + public abstract void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException; + } + + public static class GenericSiteWriter extends SiteWriter { + public GenericSiteWriter(final BCF2FieldEncoder fieldEncoder) { + super(fieldEncoder); + } + + @Override + public void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException { + final Object rawValue = vc.getAttribute(getFieldEncoder().getField(), null); + final BCF2Type type = getFieldEncoder().getType(rawValue); + if ( rawValue == null ) { + // the value is missing, just write in null + encoder.encodeType(0, type); + } else { + final int valueCount = getFieldEncoder().getBCFFieldCount(vc, rawValue); + encoder.encodeType(valueCount, type); + getFieldEncoder().encodeValue(encoder, rawValue, type); + } + } + } +} + diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java new file mode 100644 index 000000000..47764a681 
--- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext.writer; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.HashMap; +import java.util.Map; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public class BCF2FieldWriterManager { + final protected static Logger logger = Logger.getLogger(BCF2FieldWriterManager.class); + final Map siteWriters = new HashMap(); + + public BCF2FieldWriterManager() { } + + public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map dictionary) { + for (final VCFHeaderLine line : header.getMetaData()) { + if ( line instanceof VCFInfoHeaderLine ) { + final String field = ((VCFInfoHeaderLine) line).getID(); + final BCF2FieldWriter.SiteWriter writer = createInfoWriter((VCFInfoHeaderLine)line, encoder, dictionary); + logger.info("Installing for field " + field + " field writer " + writer); + siteWriters.put(field, writer); + } + } + } + + private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFInfoHeaderLine line, final BCF2Encoder encoder, final Map dict) { + BCF2FieldEncoder fieldEncoder = null; + switch ( line.getType() ) { + case Character: + case String: + fieldEncoder = new BCF2FieldEncoder.StringOrCharacter(line, encoder, dict); + break; + case Flag: + fieldEncoder = new BCF2FieldEncoder.Flag(line, encoder, dict); + break; + case Float: + fieldEncoder = new BCF2FieldEncoder.Float(line, encoder, dict); + break; + case Integer: + if ( line.getCountType() == VCFHeaderLineCount.INTEGER && line.getCount() == 1 ) + fieldEncoder = new BCF2FieldEncoder.AtomicInt(line, encoder, dict); + else + fieldEncoder = new BCF2FieldEncoder.IntList(line, encoder, dict); + break; + default: + throw new ReviewedStingException("Unexpected type for field " + line.getID()); + } + + return new BCF2FieldWriter.GenericSiteWriter(fieldEncoder); + } + + public BCF2FieldWriter.SiteWriter getSiteFieldWriter(final String key) { + final BCF2FieldWriter.SiteWriter writer = siteWriters.get(key); + if ( writer == null ) throw new ReviewedStingException("BUG: no writer found for " + key); + return writer; + } +} diff --git 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 1783cd27f..2c9f08fec 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -51,6 +51,7 @@ class BCF2Writer extends IndexingVariantContextWriter { private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors(); + final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager(); public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); @@ -80,6 +81,9 @@ class BCF2Writer extends IndexingVariantContextWriter { stringDictionaryMap.put(dict.get(i), i); } + // setup the field encodings + fieldManager.setup(header, encoder, stringDictionaryMap); + try { // write out the header into a byte stream, get it's length, and write everything to the file final ByteArrayOutputStream capture = new ByteArrayOutputStream(); @@ -225,10 +229,15 @@ class BCF2Writer extends IndexingVariantContextWriter { private void buildInfo( VariantContext vc ) throws IOException { for ( Map.Entry infoFieldEntry : vc.getAttributes().entrySet() ) { final String key = infoFieldEntry.getKey(); - final VCFToBCFEncoding encoding = prepFieldValueForEncoding(key, infoFieldEntry.getValue()); + final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(key); + writer.start(encoder, vc); + writer.site(encoder, vc); + writer.done(encoder, vc); - encodeStringByRef(key); - encoder.encodeTyped(encoding.valuesToEncode, encoding.BCF2Type); + // the old way of doing 
things +// final VCFToBCFEncoding encoding = prepFieldValueForEncoding(key, infoFieldEntry.getValue()); +// encodeStringByRef(key); +// encoder.encodeTyped(encoding.valuesToEncode, encoding.BCF2Type); } } @@ -278,9 +287,9 @@ class BCF2Writer extends IndexingVariantContextWriter { BCF2Type intType; if ( isList ) { l = (List)value; - intType = encoder.determineIntegerType(l); + intType = BCF2Utils.determineIntegerType(l); } else if ( value != null ) { - intType = encoder.determineIntegerType((Integer)value); + intType = BCF2Utils.determineIntegerType((Integer) value); l = Collections.singletonList((Integer)value); } else { intType = BCF2Type.INT8; @@ -417,7 +426,7 @@ class BCF2Writer extends IndexingVariantContextWriter { } // determine the best size - final BCF2Type type = encoder.determineIntegerType(allPLs); + final BCF2Type type = BCF2Utils.determineIntegerType(allPLs); startGenotypeField(field, numPLs, type); for ( int pl : allPLs ) encoder.encodePrimitive(pl == -1 ? type.getMissingBytes() : pl, type); @@ -495,7 +504,7 @@ class BCF2Writer extends IndexingVariantContextWriter { private final BCF2Type encodeStringByRef(final String string) throws IOException { final Integer offset = stringDictionaryMap.get(string); if ( offset == null ) throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF"); - final BCF2Type type = encoder.determineIntegerType(offset); + final BCF2Type type = BCF2Utils.determineIntegerType(offset); encoder.encodeTyped(offset, type); return type; } @@ -516,7 +525,7 @@ class BCF2Writer extends IndexingVariantContextWriter { offsets.add(offset); if ( maxType != BCF2Type.INT32) { // don't bother looking if we already are at 32 bit ints - final BCF2Type type1 = encoder.determineIntegerType(offset); + final BCF2Type type1 = BCF2Utils.determineIntegerType(offset); switch ( type1 ) { case INT8: break; case INT16: if ( maxType == BCF2Type.INT8 ) maxType = BCF2Type.INT16; break; diff --git 
a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 58f961762..3e3cf65f2 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -87,8 +87,10 @@ public abstract class BaseTest { private static final String networkTempDir; private static final File networkTempDirFile; - public static final File testDirFile = new File("public/testdata/"); + protected static final String testDirRelative = "public/testdata/"; + public static final File testDirFile = new File(testDirRelative); public static final String testDir = testDirFile.getAbsolutePath() + "/"; + protected static final String testDirRoot = testDir.replace(testDirRelative, ""); public static final String keysDataLocation = validationDataLocation + "keys/"; public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index b55cb03f2..9871f637c 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -354,7 +354,9 @@ public class WalkerTest extends BaseTest { final String now = new SimpleDateFormat("HH:mm:ss").format(new Date()); final String cmdline = Utils.join(" ",command); System.out.println(String.format("[%s] Executing test %s with GATK arguments: %s", now, name, cmdline)); - BaseTest.log(cmdline); // also write the command line to the HTML log for convenient follow-up + // also write the command line to the HTML log for convenient follow-up + // strip the absolute checkout root (literal match, not a regex) so paths become relative to the current directory + BaseTest.log(cmdline.replace(testDirRoot, "")); CommandLineExecutable.start(instance, command); } catch (Exception e) { gotAnException = true; diff --git 
a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index da8625411..620975b78 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -55,6 +55,8 @@ public class VariantContextTestProvider { final private static boolean ENABLE_PLOIDY_TESTS = true; final private static boolean ENABLE_PL_TESTS = true; final private static boolean ENABLE_SOURCE_VCF_TESTS = true; + final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = false; + private static VCFHeader syntheticHeader; final static List TEST_DATAs = new ArrayList(); private static VariantContext ROOT; @@ -160,6 +162,7 @@ public class VariantContextTestProvider { metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x")); metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x")); metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x")); + metaData.add(new VCFInfoHeaderLine("VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "x")); metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")); metaData.add(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")); @@ -180,7 +183,7 @@ public class VariantContextTestProvider { metaData.add(new VCFInfoHeaderLine("INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); metaData.add(new VCFInfoHeaderLine("FLOAT1", 1, VCFHeaderLineType.Float, "x")); metaData.add(new VCFInfoHeaderLine("FLOAT3", 3, VCFHeaderLineType.Float, "x")); - metaData.add(new VCFInfoHeaderLine("FLAG", 1, VCFHeaderLineType.Flag, "x")); + metaData.add(new VCFInfoHeaderLine("FLAG", 0, VCFHeaderLineType.Flag, "x")); syntheticHeader = new 
VCFHeader(metaData); } @@ -246,6 +249,11 @@ public class VariantContextTestProvider { add(builder().attribute("STRING3", null)); add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20"))); + add(builder().attribute("VAR.INFO.STRING", "s1")); + add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2"))); + add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2", "s3"))); + add(builder().attribute("VAR.INFO.STRING", null)); + addGenotypesToTestData(); addComplexGenotypesTest(); @@ -390,51 +398,53 @@ public class VariantContextTestProvider { attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0), attr("g2", ref, "FLOAT3")); - // - // - // TESTING MULTIPLE SIZED LISTS IN THE GENOTYPE FIELD - // - // - addGenotypeTests(site, - attr("g1", ref, "GS", Arrays.asList("S1", "S2")), - attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); + if (ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS) { + // + // + // TESTING MULTIPLE SIZED LISTS IN THE GENOTYPE FIELD + // + // + addGenotypeTests(site, + attr("g1", ref, "GS", Arrays.asList("S1", "S2")), + attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); - addGenotypeTests(site, // g1 is missing the string, and g2 is missing FLOAT1 - attr("g1", ref, "FLOAT1", 1.0), - attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); + addGenotypeTests(site, // g1 is missing the string, and g2 is missing FLOAT1 + attr("g1", ref, "FLOAT1", 1.0), + attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); - // variable sized lists - addGenotypeTests(site, - attr("g1", ref, "GV", Arrays.asList("S1")), - attr("g2", ref, "GV", Arrays.asList("S3", "S4"))); + // variable sized lists + addGenotypeTests(site, + attr("g1", ref, "GV", Arrays.asList("S1")), + attr("g2", ref, "GV", Arrays.asList("S3", "S4"))); - addGenotypeTests(site, - attr("g1", ref, "GV", Arrays.asList("S1", "S2")), - attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); + 
addGenotypeTests(site, + attr("g1", ref, "GV", Arrays.asList("S1", "S2")), + attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); - addGenotypeTests(site, // missing value in varlist of string - attr("g1", ref, "FLOAT1", 1.0), - attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); + addGenotypeTests(site, // missing value in varlist of string + attr("g1", ref, "FLOAT1", 1.0), + attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); - // - // - // TESTING GENOTYPE FILTERS - // - // - addGenotypeTests(site, - new GenotypeBuilder("g1", Arrays.asList(ref, ref)).filters("X").make(), - new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X", "Y").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make(), - new GenotypeBuilder("g3", Arrays.asList(ref, ref)).filters("X", "Y").make()); + // + // + // TESTING GENOTYPE FILTERS + // + // + addGenotypeTests(site, + new GenotypeBuilder("g1", Arrays.asList(ref, ref)).filters("X").make(), + new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X", "Y").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2", Arrays.asList(ref, 
ref)).filters("X").make(), + new GenotypeBuilder("g3", Arrays.asList(ref, ref)).filters("X", "Y").make()); + } // TODO -- test test Integer, Float, Flag, String atomic, vector, and missing types of different lengths per sample }