More code cleanup and optimizations to BCF2 writer
-- Cleanup a few contracts
-- BCF2FieldManager uses new VCFHeader accessors for specific info and format fields
-- A few simple optimizations
-- VCF header samples stored in String[] in the writer for fast access
-- getCalledChrCount() uses emptySet instead of allocating over and over empty hashset
-- VariantContextWriterStorage now creates a 1MB buffered output writer, which results in a 3x performance boost when writing BCF2 files
-- A few editorial comments in VCFHeader
This commit is contained in:
parent
e34ca0acb1
commit
8b01969762
|
|
@ -38,10 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.writer.Options;
|
|||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.io.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
|
|
@ -58,6 +55,8 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
|
|||
*/
|
||||
private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class);
|
||||
|
||||
private final static int BUFFER_SIZE = 1048576;
|
||||
|
||||
protected final File file;
|
||||
protected OutputStream stream;
|
||||
protected final VariantContextWriter writer;
|
||||
|
|
@ -93,7 +92,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
|
|||
if ( stub.isCompressed() )
|
||||
stream = new BlockCompressedOutputStream(file);
|
||||
else
|
||||
stream = new PrintStream(file);
|
||||
stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE));
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);
|
||||
|
|
|
|||
|
|
@ -24,12 +24,18 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* This class is really a POS. It allows duplicate entries in the metadata,
|
||||
* stores header lines in lots of places, and all around f*cking sucks.
|
||||
*
|
||||
* todo -- clean this POS up
|
||||
*
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VCFHeader
|
||||
|
|
@ -37,6 +43,7 @@ import java.util.*;
|
|||
* A class representing the VCF header
|
||||
*/
|
||||
public class VCFHeader {
|
||||
final protected static Logger logger = Logger.getLogger(VCFHeader.class);
|
||||
|
||||
// the mandatory header fields
|
||||
public enum HEADER_FIELDS {
|
||||
|
|
@ -164,10 +171,10 @@ public class VCFHeader {
|
|||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFInfoHeaderLine ) {
|
||||
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
|
||||
mInfoMetaData.put(infoLine.getID(), infoLine);
|
||||
addMetaDataMapBinding(mInfoMetaData, infoLine);
|
||||
} else if ( line instanceof VCFFormatHeaderLine ) {
|
||||
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
|
||||
mFormatMetaData.put(formatLine.getID(), formatLine);
|
||||
addMetaDataMapBinding(mFormatMetaData, formatLine);
|
||||
} else if ( line instanceof VCFContigHeaderLine ) {
|
||||
contigMetaData.add((VCFContigHeaderLine)line);
|
||||
} else {
|
||||
|
|
@ -176,6 +183,21 @@ public class VCFHeader {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add line to map, issuing warnings about duplicates
|
||||
*
|
||||
* @param map
|
||||
* @param line
|
||||
* @param <T>
|
||||
*/
|
||||
private final <T extends VCFCompoundHeaderLine> void addMetaDataMapBinding(final Map<String, T> map, T line) {
|
||||
final String key = line.getID();
|
||||
if ( map.containsKey(key) )
|
||||
logger.warn("Found duplicate VCF header lines for " + key + "; keeping the first only" );
|
||||
else
|
||||
map.put(key, line);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the header fields in order they're presented in the input file (which is now required to be
|
||||
* the order presented in the spec).
|
||||
|
|
@ -221,13 +243,17 @@ public class VCFHeader {
|
|||
return mGenotypeSampleNames;
|
||||
}
|
||||
|
||||
public int getNGenotypeSamples() {
|
||||
return mGenotypeSampleNames.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* do we have genotyping data?
|
||||
*
|
||||
* @return true if we have genotyping columns, false otherwise
|
||||
*/
|
||||
public boolean hasGenotypingData() {
|
||||
return mGenotypeSampleNames.size() > 0;
|
||||
return getNGenotypeSamples() > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -244,6 +270,14 @@ public class VCFHeader {
|
|||
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
|
||||
}
|
||||
|
||||
public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
|
||||
return mInfoMetaData.values();
|
||||
}
|
||||
|
||||
public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
|
||||
return mFormatMetaData.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param id the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
|
|
|
|||
|
|
@ -866,7 +866,8 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
* @return chromosome count
|
||||
*/
|
||||
public int getCalledChrCount() {
|
||||
return getCalledChrCount(new HashSet<String>(0));
|
||||
final Set<String> noSamples = Collections.emptySet();
|
||||
return getCalledChrCount(noSamples);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -159,9 +159,8 @@ public final class BCF2Encoder {
|
|||
encodePrimitive(Float.floatToIntBits((float)value), BCF2Type.FLOAT);
|
||||
}
|
||||
|
||||
@Requires("size >= 0")
|
||||
public final void encodeType(final int size, final BCF2Type type) throws IOException {
|
||||
if ( size < 0 ) throw new ReviewedStingException("BUG: size < 0");
|
||||
|
||||
final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
|
||||
encodeStream.write(typeByte);
|
||||
if ( BCF2Utils.willOverflow(size) ) {
|
||||
|
|
|
|||
|
|
@ -57,16 +57,16 @@ public class BCF2FieldWriterManager {
|
|||
* @param stringDictionary a map from VCFHeader strings to their offsets for encoding
|
||||
*/
|
||||
public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map<String, Integer> stringDictionary) {
|
||||
for (final VCFHeaderLine line : header.getMetaData()) {
|
||||
if ( line instanceof VCFInfoHeaderLine ) {
|
||||
final String field = ((VCFInfoHeaderLine) line).getID();
|
||||
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, (VCFInfoHeaderLine)line, encoder, stringDictionary);
|
||||
add(siteWriters, field, writer);
|
||||
} else if ( line instanceof VCFFormatHeaderLine ) {
|
||||
final String field = ((VCFFormatHeaderLine) line).getID();
|
||||
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, (VCFFormatHeaderLine)line, encoder, stringDictionary);
|
||||
add(genotypesWriters, field, writer);
|
||||
}
|
||||
for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) {
|
||||
final String field = line.getID();
|
||||
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary);
|
||||
add(siteWriters, field, writer);
|
||||
}
|
||||
|
||||
for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) {
|
||||
final String field = line.getID();
|
||||
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary);
|
||||
add(genotypesWriters, field, writer);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -90,6 +90,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
private final Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
|
||||
private final Map<String, Integer> stringDictionaryMap = new LinkedHashMap<String, Integer>();
|
||||
private final boolean doNotWriteGenotypes;
|
||||
private String[] sampleNames = null;
|
||||
|
||||
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
|
||||
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
|
||||
|
|
@ -122,6 +123,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
stringDictionaryMap.put(dict.get(i), i);
|
||||
}
|
||||
|
||||
sampleNames = header.getGenotypeSamples().toArray(new String[header.getNGenotypeSamples()]);
|
||||
|
||||
// setup the field encodings
|
||||
fieldManager.setup(header, encoder, stringDictionaryMap);
|
||||
|
||||
|
|
@ -289,8 +292,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
|
||||
|
||||
writer.start(encoder, vc);
|
||||
for ( final String name : header.getGenotypeSamples() ) {
|
||||
// todo -- can we optimize this get (string -> genotype) which can be expensive
|
||||
for ( final String name : sampleNames ) {
|
||||
Genotype g = vc.getGenotype(name);
|
||||
if ( g == null )
|
||||
// we don't have any data about g at all
|
||||
|
|
@ -327,8 +329,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
@Requires("! strings.isEmpty()")
|
||||
@Ensures("BCF2Type.INTEGERS.contains(result)")
|
||||
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
|
||||
assert ! strings.isEmpty();
|
||||
|
||||
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
|
||||
BCF2Type maxType = BCF2Type.INT8; // start with the smallest size
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue