More code cleanup and optimizations to BCF2 writer
-- Cleanup a few contracts
-- BCF2FieldWriterManager uses new VCFHeader accessors for specific info and format fields
-- A few simple optimizations
-- VCF header samples stored in String[] in the writer for fast access
-- getCalledChrCount() uses Collections.emptySet() instead of allocating a new empty HashSet on every call
-- VariantContextWriterStorage now creates a 1MB buffered output stream, which results in a 3x performance boost when writing BCF2 files
-- A few editorial comments in VCFHeader
This commit is contained in:
parent
e34ca0acb1
commit
8b01969762
|
|
@ -38,10 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.writer.Options;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
|
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.*;
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.OutputStream;
|
|
||||||
import java.io.PrintStream;
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.EnumSet;
|
import java.util.EnumSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
@ -58,6 +55,8 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
|
||||||
*/
|
*/
|
||||||
private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class);
|
private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class);
|
||||||
|
|
||||||
|
private final static int BUFFER_SIZE = 1048576;
|
||||||
|
|
||||||
protected final File file;
|
protected final File file;
|
||||||
protected OutputStream stream;
|
protected OutputStream stream;
|
||||||
protected final VariantContextWriter writer;
|
protected final VariantContextWriter writer;
|
||||||
|
|
@ -93,7 +92,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
|
||||||
if ( stub.isCompressed() )
|
if ( stub.isCompressed() )
|
||||||
stream = new BlockCompressedOutputStream(file);
|
stream = new BlockCompressedOutputStream(file);
|
||||||
else
|
else
|
||||||
stream = new PrintStream(file);
|
stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE));
|
||||||
}
|
}
|
||||||
catch(IOException ex) {
|
catch(IOException ex) {
|
||||||
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);
|
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);
|
||||||
|
|
|
||||||
|
|
@ -24,12 +24,18 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
import org.broad.tribble.util.ParsingUtils;
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* This class is in poor shape. It allows duplicate entries in the metadata
|
||||||
|
* and stores header lines in lots of places.
|
||||||
|
*
|
||||||
|
* todo -- clean this class up
|
||||||
|
*
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* <p/>
|
* <p/>
|
||||||
* Class VCFHeader
|
* Class VCFHeader
|
||||||
|
|
@ -37,6 +43,7 @@ import java.util.*;
|
||||||
* A class representing the VCF header
|
* A class representing the VCF header
|
||||||
*/
|
*/
|
||||||
public class VCFHeader {
|
public class VCFHeader {
|
||||||
|
final protected static Logger logger = Logger.getLogger(VCFHeader.class);
|
||||||
|
|
||||||
// the mandatory header fields
|
// the mandatory header fields
|
||||||
public enum HEADER_FIELDS {
|
public enum HEADER_FIELDS {
|
||||||
|
|
@ -164,10 +171,10 @@ public class VCFHeader {
|
||||||
for ( VCFHeaderLine line : mMetaData ) {
|
for ( VCFHeaderLine line : mMetaData ) {
|
||||||
if ( line instanceof VCFInfoHeaderLine ) {
|
if ( line instanceof VCFInfoHeaderLine ) {
|
||||||
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
|
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
|
||||||
mInfoMetaData.put(infoLine.getID(), infoLine);
|
addMetaDataMapBinding(mInfoMetaData, infoLine);
|
||||||
} else if ( line instanceof VCFFormatHeaderLine ) {
|
} else if ( line instanceof VCFFormatHeaderLine ) {
|
||||||
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
|
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
|
||||||
mFormatMetaData.put(formatLine.getID(), formatLine);
|
addMetaDataMapBinding(mFormatMetaData, formatLine);
|
||||||
} else if ( line instanceof VCFContigHeaderLine ) {
|
} else if ( line instanceof VCFContigHeaderLine ) {
|
||||||
contigMetaData.add((VCFContigHeaderLine)line);
|
contigMetaData.add((VCFContigHeaderLine)line);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -176,6 +183,21 @@ public class VCFHeader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add line to map under its ID, logging a warning and keeping the existing entry if that ID is already bound
|
||||||
|
*
|
||||||
|
* @param map the map from header line ID to header line that receives the binding
|
||||||
|
* @param line the header line to bind; its getID() value is used as the key
|
||||||
|
* @param <T> the type of compound header line held by the map
|
||||||
|
*/
|
||||||
|
private final <T extends VCFCompoundHeaderLine> void addMetaDataMapBinding(final Map<String, T> map, T line) {
|
||||||
|
final String key = line.getID();
|
||||||
|
if ( map.containsKey(key) )
|
||||||
|
logger.warn("Found duplicate VCF header lines for " + key + "; keeping the first only" );
|
||||||
|
else
|
||||||
|
map.put(key, line);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get the header fields in order they're presented in the input file (which is now required to be
|
* get the header fields in order they're presented in the input file (which is now required to be
|
||||||
* the order presented in the spec).
|
* the order presented in the spec).
|
||||||
|
|
@ -221,13 +243,17 @@ public class VCFHeader {
|
||||||
return mGenotypeSampleNames;
|
return mGenotypeSampleNames;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @return the number of genotype sample names stored in this header (the size of mGenotypeSampleNames) */
public int getNGenotypeSamples() {
|
||||||
|
return mGenotypeSampleNames.size();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* do we have genotyping data?
|
* do we have genotyping data?
|
||||||
*
|
*
|
||||||
* @return true if we have genotyping columns, false otherwise
|
* @return true if we have genotyping columns, false otherwise
|
||||||
*/
|
*/
|
||||||
public boolean hasGenotypingData() {
|
public boolean hasGenotypingData() {
|
||||||
return mGenotypeSampleNames.size() > 0;
|
return getNGenotypeSamples() > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -244,6 +270,14 @@ public class VCFHeader {
|
||||||
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
|
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * @return the VCFInfoHeaderLine values held in mInfoMetaData
 *
 * NOTE(review): the result of values() is returned directly, so it is presumably a live view of
 * the internal map rather than a copy -- confirm callers treat it as read-only
 */
public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
|
||||||
|
return mInfoMetaData.values();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * @return the VCFFormatHeaderLine values held in mFormatMetaData
 *
 * NOTE(review): the result of values() is returned directly, so it is presumably a live view of
 * the internal map rather than a copy -- confirm callers treat it as read-only
 */
public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
|
||||||
|
return mFormatMetaData.values();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param id the header key name
|
* @param id the header key name
|
||||||
* @return the meta data line, or null if there is none
|
* @return the meta data line, or null if there is none
|
||||||
|
|
|
||||||
|
|
@ -866,7 +866,8 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
||||||
* @return chromosome count
|
* @return chromosome count
|
||||||
*/
|
*/
|
||||||
public int getCalledChrCount() {
|
public int getCalledChrCount() {
|
||||||
return getCalledChrCount(new HashSet<String>(0));
|
final Set<String> noSamples = Collections.emptySet();
|
||||||
|
return getCalledChrCount(noSamples);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -159,9 +159,8 @@ public final class BCF2Encoder {
|
||||||
encodePrimitive(Float.floatToIntBits((float)value), BCF2Type.FLOAT);
|
encodePrimitive(Float.floatToIntBits((float)value), BCF2Type.FLOAT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Requires("size >= 0")
|
||||||
public final void encodeType(final int size, final BCF2Type type) throws IOException {
|
public final void encodeType(final int size, final BCF2Type type) throws IOException {
|
||||||
if ( size < 0 ) throw new ReviewedStingException("BUG: size < 0");
|
|
||||||
|
|
||||||
final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
|
final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
|
||||||
encodeStream.write(typeByte);
|
encodeStream.write(typeByte);
|
||||||
if ( BCF2Utils.willOverflow(size) ) {
|
if ( BCF2Utils.willOverflow(size) ) {
|
||||||
|
|
|
||||||
|
|
@ -57,16 +57,16 @@ public class BCF2FieldWriterManager {
|
||||||
* @param stringDictionary a map from VCFHeader strings to their offsets for encoding
|
* @param stringDictionary a map from VCFHeader strings to their offsets for encoding
|
||||||
*/
|
*/
|
||||||
public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map<String, Integer> stringDictionary) {
|
public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map<String, Integer> stringDictionary) {
|
||||||
for (final VCFHeaderLine line : header.getMetaData()) {
|
for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) {
|
||||||
if ( line instanceof VCFInfoHeaderLine ) {
|
final String field = line.getID();
|
||||||
final String field = ((VCFInfoHeaderLine) line).getID();
|
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary);
|
||||||
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, (VCFInfoHeaderLine)line, encoder, stringDictionary);
|
add(siteWriters, field, writer);
|
||||||
add(siteWriters, field, writer);
|
}
|
||||||
} else if ( line instanceof VCFFormatHeaderLine ) {
|
|
||||||
final String field = ((VCFFormatHeaderLine) line).getID();
|
for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) {
|
||||||
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, (VCFFormatHeaderLine)line, encoder, stringDictionary);
|
final String field = line.getID();
|
||||||
add(genotypesWriters, field, writer);
|
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary);
|
||||||
}
|
add(genotypesWriters, field, writer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -90,6 +90,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
||||||
private final Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
|
private final Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
|
||||||
private final Map<String, Integer> stringDictionaryMap = new LinkedHashMap<String, Integer>();
|
private final Map<String, Integer> stringDictionaryMap = new LinkedHashMap<String, Integer>();
|
||||||
private final boolean doNotWriteGenotypes;
|
private final boolean doNotWriteGenotypes;
|
||||||
|
private String[] sampleNames = null;
|
||||||
|
|
||||||
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
|
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
|
||||||
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
|
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
|
||||||
|
|
@ -122,6 +123,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
||||||
stringDictionaryMap.put(dict.get(i), i);
|
stringDictionaryMap.put(dict.get(i), i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sampleNames = header.getGenotypeSamples().toArray(new String[header.getNGenotypeSamples()]);
|
||||||
|
|
||||||
// setup the field encodings
|
// setup the field encodings
|
||||||
fieldManager.setup(header, encoder, stringDictionaryMap);
|
fieldManager.setup(header, encoder, stringDictionaryMap);
|
||||||
|
|
||||||
|
|
@ -289,8 +292,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
||||||
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
|
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
|
||||||
|
|
||||||
writer.start(encoder, vc);
|
writer.start(encoder, vc);
|
||||||
for ( final String name : header.getGenotypeSamples() ) {
|
for ( final String name : sampleNames ) {
|
||||||
// todo -- can we optimize this get (string -> genotype) which can be expensive
|
|
||||||
Genotype g = vc.getGenotype(name);
|
Genotype g = vc.getGenotype(name);
|
||||||
if ( g == null )
|
if ( g == null )
|
||||||
// we don't have any data about g at all
|
// we don't have any data about g at all
|
||||||
|
|
@ -327,8 +329,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
||||||
@Requires("! strings.isEmpty()")
|
@Requires("! strings.isEmpty()")
|
||||||
@Ensures("BCF2Type.INTEGERS.contains(result)")
|
@Ensures("BCF2Type.INTEGERS.contains(result)")
|
||||||
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
|
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
|
||||||
assert ! strings.isEmpty();
|
|
||||||
|
|
||||||
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
|
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
|
||||||
BCF2Type maxType = BCF2Type.INT8; // start with the smallest size
|
BCF2Type maxType = BCF2Type.INT8; // start with the smallest size
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue