More code cleanup and optimizations to BCF2 writer

-- Cleanup a few contracts
-- BCF2FieldManager uses new VCFHeader accessors for specific info and format fields
-- A few simple optimizations
    -- VCF header samples stored in String[] in the writer for fast access
    -- getCalledChrCount() now uses Collections.emptySet() instead of allocating a new empty HashSet on every call
    -- VariantContextWriterStorage now creates a 1MB buffered output writer, which results in 3x performance boost when writing BCF2 files
-- A few editorial comments in VCFHeader
This commit is contained in:
Mark DePristo 2012-06-13 21:49:22 -04:00
parent e34ca0acb1
commit 8b01969762
6 changed files with 58 additions and 25 deletions

View File

@ -38,10 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.*;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
@ -58,6 +55,8 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
*/
private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class);
private final static int BUFFER_SIZE = 1048576;
protected final File file;
protected OutputStream stream;
protected final VariantContextWriter writer;
@ -93,7 +92,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
if ( stub.isCompressed() )
stream = new BlockCompressedOutputStream(file);
else
stream = new PrintStream(file);
stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE));
}
catch(IOException ex) {
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);

View File

@ -24,12 +24,18 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.apache.log4j.Logger;
import org.broad.tribble.util.ParsingUtils;
import java.util.*;
/**
 * This class needs substantial cleanup: it allows duplicate entries in the metadata
 * and stores header lines in several different places.
*
* todo -- clean this POS up
*
* @author aaron
* <p/>
* Class VCFHeader
@ -37,6 +43,7 @@ import java.util.*;
* A class representing the VCF header
*/
public class VCFHeader {
final protected static Logger logger = Logger.getLogger(VCFHeader.class);
// the mandatory header fields
public enum HEADER_FIELDS {
@ -164,10 +171,10 @@ public class VCFHeader {
for ( VCFHeaderLine line : mMetaData ) {
if ( line instanceof VCFInfoHeaderLine ) {
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
mInfoMetaData.put(infoLine.getID(), infoLine);
addMetaDataMapBinding(mInfoMetaData, infoLine);
} else if ( line instanceof VCFFormatHeaderLine ) {
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
mFormatMetaData.put(formatLine.getID(), formatLine);
addMetaDataMapBinding(mFormatMetaData, formatLine);
} else if ( line instanceof VCFContigHeaderLine ) {
contigMetaData.add((VCFContigHeaderLine)line);
} else {
@ -176,6 +183,21 @@ public class VCFHeader {
}
}
/**
 * Add a compound header line to the given ID-keyed map, warning about (and dropping)
 * duplicate IDs so that only the first line seen for an ID is retained.
 *
 * @param map  the ID -> header-line map to update (e.g. mInfoMetaData or mFormatMetaData)
 * @param line the INFO or FORMAT header line to bind by its ID
 * @param <T>  the concrete compound header line type (VCFInfoHeaderLine or VCFFormatHeaderLine)
 */
private final <T extends VCFCompoundHeaderLine> void addMetaDataMapBinding(final Map<String, T> map, final T line) {
    final String key = line.getID();
    if ( map.containsKey(key) )
        // duplicate definitions are a header bug; keep the first so behavior is deterministic
        logger.warn("Found duplicate VCF header lines for " + key + "; keeping the first only" );
    else
        map.put(key, line);
}
/**
* get the header fields in order they're presented in the input file (which is now required to be
* the order presented in the spec).
@ -221,13 +243,17 @@ public class VCFHeader {
return mGenotypeSampleNames;
}
/**
 * Get the number of genotype (sample) columns declared in this header.
 *
 * @return the number of entries in the genotype sample name list; 0 for a sites-only header
 */
public int getNGenotypeSamples() {
return mGenotypeSampleNames.size();
}
/**
 * do we have genotyping data?
 *
 * @return true if we have genotyping columns, false otherwise
 */
public boolean hasGenotypingData() {
    // delegate to getNGenotypeSamples() so the sample count is computed in exactly one place
    return getNGenotypeSamples() > 0;
}
/**
@ -244,6 +270,14 @@ public class VCFHeader {
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
}
/**
 * Get all of the INFO header lines declared in this header.
 *
 * @return an unmodifiable view of the INFO header lines, keyed by ID internally
 */
public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
    // wrap in an unmodifiable view: Map.values() otherwise supports remove(), which
    // would let callers silently mutate the header's internal state
    return Collections.unmodifiableCollection(mInfoMetaData.values());
}

/**
 * Get all of the FORMAT header lines declared in this header.
 *
 * @return an unmodifiable view of the FORMAT header lines, keyed by ID internally
 */
public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
    return Collections.unmodifiableCollection(mFormatMetaData.values());
}
/**
* @param id the header key name
* @return the meta data line, or null if there is none

View File

@ -866,7 +866,8 @@ public class VariantContext implements Feature { // to enable tribble integratio
* @return chromosome count
*/
public int getCalledChrCount() {
    // reuse the shared immutable empty set instead of allocating a fresh HashSet on
    // every call -- this overload means "exclude no samples"
    final Set<String> noSamples = Collections.emptySet();
    return getCalledChrCount(noSamples);
}
/**

View File

@ -159,9 +159,8 @@ public final class BCF2Encoder {
encodePrimitive(Float.floatToIntBits((float)value), BCF2Type.FLOAT);
}
@Requires("size >= 0")
public final void encodeType(final int size, final BCF2Type type) throws IOException {
if ( size < 0 ) throw new ReviewedStingException("BUG: size < 0");
final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
encodeStream.write(typeByte);
if ( BCF2Utils.willOverflow(size) ) {

View File

@ -57,16 +57,16 @@ public class BCF2FieldWriterManager {
* @param stringDictionary a map from VCFHeader strings to their offsets for encoding
*/
public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map<String, Integer> stringDictionary) {
for (final VCFHeaderLine line : header.getMetaData()) {
if ( line instanceof VCFInfoHeaderLine ) {
final String field = ((VCFInfoHeaderLine) line).getID();
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, (VCFInfoHeaderLine)line, encoder, stringDictionary);
add(siteWriters, field, writer);
} else if ( line instanceof VCFFormatHeaderLine ) {
final String field = ((VCFFormatHeaderLine) line).getID();
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, (VCFFormatHeaderLine)line, encoder, stringDictionary);
add(genotypesWriters, field, writer);
}
for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) {
final String field = line.getID();
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary);
add(siteWriters, field, writer);
}
for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) {
final String field = line.getID();
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary);
add(genotypesWriters, field, writer);
}
}

View File

@ -90,6 +90,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
private final Map<String, Integer> stringDictionaryMap = new LinkedHashMap<String, Integer>();
private final boolean doNotWriteGenotypes;
private String[] sampleNames = null;
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
@ -122,6 +123,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
stringDictionaryMap.put(dict.get(i), i);
}
sampleNames = header.getGenotypeSamples().toArray(new String[header.getNGenotypeSamples()]);
// setup the field encodings
fieldManager.setup(header, encoder, stringDictionaryMap);
@ -289,8 +292,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
writer.start(encoder, vc);
for ( final String name : header.getGenotypeSamples() ) {
// todo -- can we optimize this get (string -> genotype) which can be expensive
for ( final String name : sampleNames ) {
Genotype g = vc.getGenotype(name);
if ( g == null )
// we don't have any data about g at all
@ -327,8 +329,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
@Requires("! strings.isEmpty()")
@Ensures("BCF2Type.INTEGERS.contains(result)")
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
assert ! strings.isEmpty();
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
BCF2Type maxType = BCF2Type.INT8; // start with the smallest size