Actual working version of unflushing VCFWriter

-- Uses high-performance local writer backed by byte array that writes the entire VCF line in some write operation to the underlying output stream.
-- Fixes problems with indexing of unflushed writes while still allowing efficient block zipping
-- Same (or better) IO performance as previous implementation
-- IndexingVariantContextWriter now properly closes the underlying output stream when it's closed
-- Updated compressed VCF output file
This commit is contained in:
Mark DePristo 2012-12-13 12:24:08 -05:00
parent 5e66109268
commit aeab932c63
3 changed files with 97 additions and 47 deletions

View File

@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "3eba6c309514d1e9ee06a20a112b68e6";
private final static String COMPRESSED_OUTPUT_MD5 = "af8187e2baf516dde1cddea787a52b8a";
@Test
public void testCompressedOutput() {

View File

@ -93,16 +93,19 @@ abstract class IndexingVariantContextWriter implements VariantContextWriter {
* attempt to close the VCF file
*/
public void close() {
// try to close the index stream (keep it separate to help debugging efforts)
if ( indexer != null ) {
try {
try {
// try to close the index stream (keep it separate to help debugging efforts)
if ( indexer != null ) {
Index index = indexer.finalizeIndex(positionalOutputStream.getPosition());
IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict);
index.write(idxStream);
idxStream.close();
} catch (IOException e) {
throw new ReviewedStingException("Unable to close index for " + getStreamName(), e);
}
// close the underlying output stream as well
outputStream.close();
} catch (IOException e) {
throw new ReviewedStingException("Unable to close index for " + getStreamName(), e);
}
}

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.*;
import java.lang.reflect.Array;
import java.nio.charset.Charset;
import java.util.*;
/**
@ -42,9 +43,6 @@ import java.util.*;
class VCFWriter extends IndexingVariantContextWriter {
private final static String VERSION_LINE = VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_1.getFormatString() + "=" + VCFHeaderVersion.VCF4_1.getVersionString();
// the print stream we're writing to
final protected BufferedWriter mWriter;
// should we write genotypes or just sites?
final protected boolean doNotWriteGenotypes;
@ -53,15 +51,33 @@ class VCFWriter extends IndexingVariantContextWriter {
final private boolean allowMissingFieldsInHeader;
/**
* The VCF writer uses an internal Writer, based by the ByteArrayOutputStream lineBuffer,
* to temp. buffer the header and per-site output before flushing the per line output
* in one go to the super.getOutputStream. This results in high-performance, proper encoding,
* and allows us to avoid flushing explicitly the output stream getOutputStream, which
* allows us to properly compress vcfs in gz format without breaking indexing on the fly
* for uncompressed streams.
*/
private static final int INITIAL_BUFFER_SIZE = 1024 * 16;
private final ByteArrayOutputStream lineBuffer = new ByteArrayOutputStream(INITIAL_BUFFER_SIZE);
private final Writer writer;
/**
* The encoding used for VCF files. ISO-8859-1
*/
final private Charset charset;
private IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();
public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes,
final boolean allowMissingFieldsInHeader ) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
this.doNotWriteGenotypes = doNotWriteGenotypes;
this.allowMissingFieldsInHeader = allowMissingFieldsInHeader;
this.charset = Charset.forName("ISO-8859-1");
this.writer = new OutputStreamWriter(lineBuffer, charset);
}
// --------------------------------------------------------------------------------
@ -70,14 +86,44 @@ class VCFWriter extends IndexingVariantContextWriter {
//
// --------------------------------------------------------------------------------
/**
* Write String s to the internal buffered writer.
*
* flushBuffer() must be called to actually write the data to the true output stream.
*
* @param s the string to write
* @throws IOException
*/
private void write(final String s) throws IOException {
writer.write(s);
}
/**
* Actually write the line buffer contents to the destination output stream.
*
* After calling this function the line buffer is reset, so the contents of the buffer can be reused
*
* @throws IOException
*/
private void flushBuffer() throws IOException {
writer.flush();
getOutputStream().write(lineBuffer.toByteArray());
lineBuffer.reset();
}
@Override
public void writeHeader(VCFHeader header) {
// note we need to update the mHeader object after this call because they header
// may have genotypes trimmed out of it, if doNotWriteGenotypes is true
mHeader = writeHeader(header, mWriter, doNotWriteGenotypes, getVersionLine(), getStreamName());
try {
mHeader = writeHeader(header, writer, doNotWriteGenotypes, getVersionLine(), getStreamName());
flushBuffer();
} catch ( IOException e ) {
throw new UserException.CouldNotCreateOutputFile(getStreamName(), e);
}
}
public static final String getVersionLine() {
public static String getVersionLine() {
return VERSION_LINE;
}
@ -138,8 +184,8 @@ class VCFWriter extends IndexingVariantContextWriter {
public void close() {
// try to close the vcf stream
try {
mWriter.flush();
mWriter.close();
// TODO -- would it be useful to null out the line buffer so we don't have it around unnecessarily?
writer.close();
} catch (IOException e) {
throw new ReviewedStingException("Unable to close " + getStreamName(), e);
}
@ -166,51 +212,51 @@ class VCFWriter extends IndexingVariantContextWriter {
Map<Allele, String> alleleMap = buildAlleleMap(vc);
// CHROM
mWriter.write(vc.getChr());
mWriter.write(VCFConstants.FIELD_SEPARATOR);
write(vc.getChr());
write(VCFConstants.FIELD_SEPARATOR);
// POS
mWriter.write(String.valueOf(vc.getStart()));
mWriter.write(VCFConstants.FIELD_SEPARATOR);
write(String.valueOf(vc.getStart()));
write(VCFConstants.FIELD_SEPARATOR);
// ID
String ID = vc.getID();
mWriter.write(ID);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
write(ID);
write(VCFConstants.FIELD_SEPARATOR);
// REF
String refString = vc.getReference().getDisplayString();
mWriter.write(refString);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
write(refString);
write(VCFConstants.FIELD_SEPARATOR);
// ALT
if ( vc.isVariant() ) {
Allele altAllele = vc.getAlternateAllele(0);
String alt = altAllele.getDisplayString();
mWriter.write(alt);
write(alt);
for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
altAllele = vc.getAlternateAllele(i);
alt = altAllele.getDisplayString();
mWriter.write(",");
mWriter.write(alt);
write(",");
write(alt);
}
} else {
mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
}
mWriter.write(VCFConstants.FIELD_SEPARATOR);
write(VCFConstants.FIELD_SEPARATOR);
// QUAL
if ( !vc.hasLog10PError() )
mWriter.write(VCFConstants.MISSING_VALUE_v4);
write(VCFConstants.MISSING_VALUE_v4);
else
mWriter.write(formatQualValue(vc.getPhredScaledQual()));
mWriter.write(VCFConstants.FIELD_SEPARATOR);
write(formatQualValue(vc.getPhredScaledQual()));
write(VCFConstants.FIELD_SEPARATOR);
// FILTER
String filters = getFilterString(vc);
mWriter.write(filters);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
write(filters);
write(VCFConstants.FIELD_SEPARATOR);
// INFO
Map<String, String> infoFields = new TreeMap<String, String>();
@ -229,8 +275,8 @@ class VCFWriter extends IndexingVariantContextWriter {
// FORMAT
final GenotypesContext gc = vc.getGenotypes();
if ( gc.isLazyWithData() && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() instanceof String ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData().toString());
write(VCFConstants.FIELD_SEPARATOR);
write(((LazyGenotypesContext) gc).getUnparsedGenotypeData().toString());
} else {
List<String> genotypeAttributeKeys = calcVCFGenotypeKeys(vc, mHeader);
if ( ! genotypeAttributeKeys.isEmpty() ) {
@ -240,16 +286,17 @@ class VCFWriter extends IndexingVariantContextWriter {
final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
mWriter.write(genotypeFormatString);
write(VCFConstants.FIELD_SEPARATOR);
write(genotypeFormatString);
addGenotypeData(vc, alleleMap, genotypeAttributeKeys);
}
}
mWriter.write("\n");
write("\n");
// note that we cannot call flush here if we want block gzipping to work properly
// calling flush results in all gzipped blocks for each variant
flushBuffer();
} catch (IOException e) {
throw new RuntimeException("Unable to write the VCF object to " + getStreamName(), e);
}
@ -305,7 +352,7 @@ class VCFWriter extends IndexingVariantContextWriter {
*/
private void writeInfoString(Map<String, String> infoFields) throws IOException {
if ( infoFields.isEmpty() ) {
mWriter.write(VCFConstants.EMPTY_INFO_FIELD);
write(VCFConstants.EMPTY_INFO_FIELD);
return;
}
@ -314,16 +361,16 @@ class VCFWriter extends IndexingVariantContextWriter {
if ( isFirst )
isFirst = false;
else
mWriter.write(VCFConstants.INFO_FIELD_SEPARATOR);
write(VCFConstants.INFO_FIELD_SEPARATOR);
String key = entry.getKey();
mWriter.write(key);
write(key);
if ( !entry.getValue().equals("") ) {
VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key);
if ( metaData == null || metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() != 0 ) {
mWriter.write("=");
mWriter.write(entry.getValue());
write("=");
write(entry.getValue());
}
}
}
@ -342,7 +389,7 @@ class VCFWriter extends IndexingVariantContextWriter {
final int ploidy = vc.getMaxPloidy(2);
for ( String sample : mHeader.getGenotypeSamples() ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
write(VCFConstants.FIELD_SEPARATOR);
Genotype g = vc.getGenotype(sample);
if ( g == null ) g = GenotypeBuilder.createMissing(sample, ploidy);
@ -356,7 +403,7 @@ class VCFWriter extends IndexingVariantContextWriter {
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
@ -420,8 +467,8 @@ class VCFWriter extends IndexingVariantContextWriter {
for (int i = 0; i < attrs.size(); i++) {
if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY) )
mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
mWriter.write(attrs.get(i));
write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
write(attrs.get(i));
}
}
}
@ -435,7 +482,7 @@ class VCFWriter extends IndexingVariantContextWriter {
String encoding = alleleMap.get(allele);
if ( encoding == null )
throw new TribbleException.InternalCodecException("Allele " + allele + " is not an allele in the variant context");
mWriter.write(encoding);
write(encoding);
}
/**