More code cleanup and optimizations to BCF2 writer

-- Cleanup a few contracts
-- BCF2FieldManager uses new VCFHeader accessors for specific info and format fields
-- A few simple optimizations
    -- VCF header samples stored in String[] in the writer for fast access
    -- getCalledChrCount() now uses Collections.emptySet() instead of allocating a new empty HashSet on every call
    -- VariantContextWriterStorage now creates a 1MB buffered output writer, which results in 3x performance boost when writing BCF2 files
-- A few editorial comments in VCFHeader
This commit is contained in:
Mark DePristo 2012-06-13 21:49:22 -04:00
parent e34ca0acb1
commit 8b01969762
6 changed files with 58 additions and 25 deletions

View File

@ -38,10 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.*;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
@ -58,6 +55,8 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
*/
private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class);
private final static int BUFFER_SIZE = 1048576;
protected final File file;
protected OutputStream stream;
protected final VariantContextWriter writer;
@ -93,7 +92,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
if ( stub.isCompressed() )
stream = new BlockCompressedOutputStream(file);
else
stream = new PrintStream(file);
stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE));
}
catch(IOException ex) {
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);

View File

@ -24,12 +24,18 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.apache.log4j.Logger;
import org.broad.tribble.util.ParsingUtils;
import java.util.*;
/**
 * This class needs substantial cleanup: it allows duplicate entries in the metadata
 * and stores header lines in several different places.
*
* todo -- clean this POS up
*
* @author aaron
* <p/>
* Class VCFHeader
@ -37,6 +43,7 @@ import java.util.*;
* A class representing the VCF header
*/
public class VCFHeader {
final protected static Logger logger = Logger.getLogger(VCFHeader.class);
// the mandatory header fields
public enum HEADER_FIELDS {
@ -164,10 +171,10 @@ public class VCFHeader {
for ( VCFHeaderLine line : mMetaData ) {
if ( line instanceof VCFInfoHeaderLine ) {
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
mInfoMetaData.put(infoLine.getID(), infoLine);
addMetaDataMapBinding(mInfoMetaData, infoLine);
} else if ( line instanceof VCFFormatHeaderLine ) {
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
mFormatMetaData.put(formatLine.getID(), formatLine);
addMetaDataMapBinding(mFormatMetaData, formatLine);
} else if ( line instanceof VCFContigHeaderLine ) {
contigMetaData.add((VCFContigHeaderLine)line);
} else {
@ -176,6 +183,21 @@ public class VCFHeader {
}
}
/**
 * Add a compound header line to the given ID-keyed map, warning about (and dropping)
 * duplicate IDs so that only the first line seen for an ID is retained.
 *
 * @param map  the ID -> header-line map to update (e.g. mInfoMetaData or mFormatMetaData)
 * @param line the INFO or FORMAT header line to bind by its ID
 * @param <T>  the concrete compound header line type (VCFInfoHeaderLine or VCFFormatHeaderLine)
 */
private final <T extends VCFCompoundHeaderLine> void addMetaDataMapBinding(final Map<String, T> map, final T line) {
    final String key = line.getID();
    if ( map.containsKey(key) )
        // duplicate definitions are a header bug; keep the first so behavior is deterministic
        logger.warn("Found duplicate VCF header lines for " + key + "; keeping the first only" );
    else
        map.put(key, line);
}
/**
* get the header fields in order they're presented in the input file (which is now required to be
* the order presented in the spec).
@ -221,13 +243,17 @@ public class VCFHeader {
return mGenotypeSampleNames;
}
/**
 * Get the number of genotype (sample) columns declared in this header.
 *
 * @return the number of entries in the genotype sample name list; 0 for a sites-only header
 */
public int getNGenotypeSamples() {
return mGenotypeSampleNames.size();
}
/**
 * do we have genotyping data?
 *
 * @return true if we have genotyping columns, false otherwise
 */
public boolean hasGenotypingData() {
    // delegate to getNGenotypeSamples() so the sample count is computed in exactly one place
    return getNGenotypeSamples() > 0;
}
/**
@ -244,6 +270,14 @@ public class VCFHeader {
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
}
/**
 * Get all of the INFO header lines declared in this header.
 *
 * @return an unmodifiable view of the INFO header lines, keyed by ID internally
 */
public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
    // wrap in an unmodifiable view: Map.values() otherwise supports remove(), which
    // would let callers silently mutate the header's internal state
    return Collections.unmodifiableCollection(mInfoMetaData.values());
}

/**
 * Get all of the FORMAT header lines declared in this header.
 *
 * @return an unmodifiable view of the FORMAT header lines, keyed by ID internally
 */
public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
    return Collections.unmodifiableCollection(mFormatMetaData.values());
}
/**
* @param id the header key name
* @return the meta data line, or null if there is none

View File

@ -866,7 +866,8 @@ public class VariantContext implements Feature { // to enable tribble integratio
* @return chromosome count
*/
public int getCalledChrCount() {
    // reuse the shared immutable empty set instead of allocating a fresh HashSet on
    // every call -- this overload means "exclude no samples"
    final Set<String> noSamples = Collections.emptySet();
    return getCalledChrCount(noSamples);
}
/**

View File

@ -159,9 +159,8 @@ public final class BCF2Encoder {
encodePrimitive(Float.floatToIntBits((float)value), BCF2Type.FLOAT);
}
@Requires("size >= 0")
public final void encodeType(final int size, final BCF2Type type) throws IOException {
if ( size < 0 ) throw new ReviewedStingException("BUG: size < 0");
final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
encodeStream.write(typeByte);
if ( BCF2Utils.willOverflow(size) ) {

View File

@ -57,16 +57,16 @@ public class BCF2FieldWriterManager {
* @param stringDictionary a map from VCFHeader strings to their offsets for encoding
*/
public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map<String, Integer> stringDictionary) {
for (final VCFHeaderLine line : header.getMetaData()) {
if ( line instanceof VCFInfoHeaderLine ) {
final String field = ((VCFInfoHeaderLine) line).getID();
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, (VCFInfoHeaderLine)line, encoder, stringDictionary);
add(siteWriters, field, writer);
} else if ( line instanceof VCFFormatHeaderLine ) {
final String field = ((VCFFormatHeaderLine) line).getID();
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, (VCFFormatHeaderLine)line, encoder, stringDictionary);
add(genotypesWriters, field, writer);
}
for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) {
final String field = line.getID();
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary);
add(siteWriters, field, writer);
}
for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) {
final String field = line.getID();
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary);
add(genotypesWriters, field, writer);
}
}

View File

@ -90,6 +90,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
private final Map<String, Integer> stringDictionaryMap = new LinkedHashMap<String, Integer>();
private final boolean doNotWriteGenotypes;
private String[] sampleNames = null;
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
@ -122,6 +123,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
stringDictionaryMap.put(dict.get(i), i);
}
sampleNames = header.getGenotypeSamples().toArray(new String[header.getNGenotypeSamples()]);
// setup the field encodings
fieldManager.setup(header, encoder, stringDictionaryMap);
@ -289,8 +292,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
writer.start(encoder, vc);
for ( final String name : header.getGenotypeSamples() ) {
// todo -- can we optimize this get (string -> genotype) which can be expensive
for ( final String name : sampleNames ) {
Genotype g = vc.getGenotype(name);
if ( g == null )
// we don't have any data about g at all
@ -327,8 +329,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
@Requires("! strings.isEmpty()")
@Ensures("BCF2Type.INTEGERS.contains(result)")
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
assert ! strings.isEmpty();
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
BCF2Type maxType = BCF2Type.INT8; // start with the smallest size