From 8b0196976269d3a9aa607c28fc72bf360f507366 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 13 Jun 2012 21:49:22 -0400 Subject: [PATCH] More code cleanup and optimizations to BCF2 writer -- Cleanup a few contracts -- BCF2FieldManager uses new VCFHeader accessors for specific info and format fields -- A few simple optimizations -- VCF header samples stored in String[] in the writer for fast access -- getCalledChrCount() uses emptySet instead of allocating over and over empty hashset -- VariantContextWriterStorage now creates a 1MB buffered output writer, which results in 3x performance boost when writing BCF2 files -- A few editorial comments in VCFHeader --- .../storage/VariantContextWriterStorage.java | 9 ++--- .../sting/utils/codecs/vcf/VCFHeader.java | 40 +++++++++++++++++-- .../utils/variantcontext/VariantContext.java | 3 +- .../variantcontext/writer/BCF2Encoder.java | 3 +- .../writer/BCF2FieldWriterManager.java | 20 +++++----- .../variantcontext/writer/BCF2Writer.java | 8 ++-- 6 files changed, 58 insertions(+), 25 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java index 972144ab8..b97475000 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java @@ -38,10 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.writer.Options; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory; -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; -import java.io.PrintStream; +import java.io.*; import java.util.Arrays; import java.util.EnumSet; import java.util.List; @@ -58,6 +55,8 @@ public class VariantContextWriterStorage implements Storage * Class VCFHeader @@ -37,6 +43,7 @@ import java.util.*; * A class representing the VCF header */ public class VCFHeader { + final protected static Logger logger = Logger.getLogger(VCFHeader.class); // the mandatory header fields public enum HEADER_FIELDS { @@ -164,10 +171,10 @@ public class VCFHeader { for ( VCFHeaderLine line : mMetaData ) { if ( line instanceof VCFInfoHeaderLine ) { VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; - mInfoMetaData.put(infoLine.getID(), infoLine); + addMetaDataMapBinding(mInfoMetaData, infoLine); } else if ( line instanceof VCFFormatHeaderLine ) { VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; - mFormatMetaData.put(formatLine.getID(), formatLine); + addMetaDataMapBinding(mFormatMetaData, formatLine); } else if ( line instanceof VCFContigHeaderLine ) { contigMetaData.add((VCFContigHeaderLine)line); } else { @@ -176,6 +183,21 @@ public class VCFHeader { } } + /** + * Add line to map, issuing warnings about duplicates + * + * @param map + * @param line + * @param + */ + private final void addMetaDataMapBinding(final Map map, T line) { + final String key = line.getID(); + if ( map.containsKey(key) ) + logger.warn("Found duplicate VCF header lines for " + key + "; keeping the first only" ); + else + map.put(key, line); + } + /** * get the header fields in order they're presented in the input file (which is now required to be * the order presented in the spec). @@ -221,13 +243,17 @@ public class VCFHeader { return mGenotypeSampleNames; } + public int getNGenotypeSamples() { + return mGenotypeSampleNames.size(); + } + /** * do we have genotyping data? * * @return true if we have genotyping columns, false otherwise */ public boolean hasGenotypingData() { - return mGenotypeSampleNames.size() > 0; + return getNGenotypeSamples() > 0; } /** @@ -244,6 +270,14 @@ public class VCFHeader { return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0); } + public Collection getInfoHeaderLines() { + return mInfoMetaData.values(); + } + + public Collection getFormatHeaderLines() { + return mFormatMetaData.values(); + } + /** * @param id the header key name * @return the meta data line, or null if there is none diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index bd37c482d..6739f1d0e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -866,7 +866,8 @@ public class VariantContext implements Feature { // to enable tribble integratio * @return chromosome count */ public int getCalledChrCount() { - return getCalledChrCount(new HashSet(0)); + final Set noSamples = Collections.emptySet(); + return getCalledChrCount(noSamples); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java index 4d2eae4ab..62da3174d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java @@ -159,9 +159,8 @@ public final class BCF2Encoder { encodePrimitive(Float.floatToIntBits((float)value), BCF2Type.FLOAT); } + @Requires("size >= 0") public final void encodeType(final int size, final BCF2Type type) throws IOException { - if ( size < 0 ) throw new ReviewedStingException("BUG: size < 0"); - final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type); encodeStream.write(typeByte); if ( BCF2Utils.willOverflow(size) ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java index 665c2cc21..00ab1a2ed 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java @@ -57,16 +57,16 @@ public class BCF2FieldWriterManager { * @param stringDictionary a map from VCFHeader strings to their offsets for encoding */ public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map stringDictionary) { - for (final VCFHeaderLine line : header.getMetaData()) { - if ( line instanceof VCFInfoHeaderLine ) { - final String field = ((VCFInfoHeaderLine) line).getID(); - final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, (VCFInfoHeaderLine)line, encoder, stringDictionary); - add(siteWriters, field, writer); - } else if ( line instanceof VCFFormatHeaderLine ) { - final String field = ((VCFFormatHeaderLine) line).getID(); - final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, (VCFFormatHeaderLine)line, encoder, stringDictionary); - add(genotypesWriters, field, writer); - } + for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) { + final String field = line.getID(); + final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary); + add(siteWriters, field, writer); + } + + for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) { + final String field = line.getID(); + final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary); + add(genotypesWriters, field, writer); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index f72cb81ec..b328dfc59 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -90,6 +90,7 @@ class BCF2Writer extends IndexingVariantContextWriter { private final Map contigDictionary = new HashMap(); private final Map stringDictionaryMap = new LinkedHashMap(); private final boolean doNotWriteGenotypes; + private String[] sampleNames = null; private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager(); @@ -122,6 +123,8 @@ class BCF2Writer extends IndexingVariantContextWriter { stringDictionaryMap.put(dict.get(i), i); } + sampleNames = header.getGenotypeSamples().toArray(new String[header.getNGenotypeSamples()]); + // setup the field encodings fieldManager.setup(header, encoder, stringDictionaryMap); @@ -289,8 +292,7 @@ class BCF2Writer extends IndexingVariantContextWriter { final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); writer.start(encoder, vc); - for ( final String name : header.getGenotypeSamples() ) { - // todo -- can we optimize this get (string -> genotype) which can be expensive + for ( final String name : sampleNames ) { Genotype g = vc.getGenotype(name); if ( g == null ) // we don't have any data about g at all @@ -327,8 +329,6 @@ class BCF2Writer extends IndexingVariantContextWriter { @Requires("! strings.isEmpty()") @Ensures("BCF2Type.INTEGERS.contains(result)") private final BCF2Type encodeStringsByRef(final Collection strings) throws IOException { - assert ! strings.isEmpty(); - final List offsets = new ArrayList(strings.size()); BCF2Type maxType = BCF2Type.INT8; // start with the smallest size