GCF improvements
-- Support for streaming VCF writing via the VCFWriter interface -- GCF now has a header and a footer. The header is minimal and contains a forward pointer to the position of the footer in the file. -- Readers now read the header and then jump to the footer to get the rest of the "header" information -- Version is now a field in GCF
This commit is contained in:
parent
fe5724b6ea
commit
cd2c511c4a
|
|
@ -79,8 +79,13 @@ public class GCF {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException {
|
public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException {
|
||||||
chromOffset = inputStream.readInt();
|
chromOffset = inputStream.readInt();
|
||||||
|
|
||||||
|
// have we reached the footer?
|
||||||
|
if ( chromOffset == GCFHeader.FOOTER_START_MARKER )
|
||||||
|
throw new EOFException();
|
||||||
|
|
||||||
start = inputStream.readInt();
|
start = inputStream.readInt();
|
||||||
stop = inputStream.readInt();
|
stop = inputStream.readInt();
|
||||||
id = inputStream.readUTF();
|
id = inputStream.readUTF();
|
||||||
|
|
@ -106,6 +111,32 @@ public class GCF {
|
||||||
throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key");
|
throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int write(DataOutputStream outputStream) throws IOException {
|
||||||
|
int startSize = outputStream.size();
|
||||||
|
outputStream.writeInt(chromOffset);
|
||||||
|
outputStream.writeInt(start);
|
||||||
|
outputStream.writeInt(stop);
|
||||||
|
outputStream.writeUTF(id);
|
||||||
|
outputStream.writeByte(refPad);
|
||||||
|
writeIntArray(alleleOffsets, outputStream, true);
|
||||||
|
outputStream.writeFloat(qual);
|
||||||
|
outputStream.writeUTF(info);
|
||||||
|
outputStream.writeInt(filterOffset);
|
||||||
|
|
||||||
|
int nGenotypes = genotypes.size();
|
||||||
|
int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes;
|
||||||
|
outputStream.writeInt(nGenotypes);
|
||||||
|
outputStream.writeInt(expectedSizeOfGenotypes);
|
||||||
|
int obsSizeOfGenotypes = 0;
|
||||||
|
for ( GCFGenotype g : genotypes )
|
||||||
|
obsSizeOfGenotypes += g.write(outputStream);
|
||||||
|
if ( obsSizeOfGenotypes != expectedSizeOfGenotypes )
|
||||||
|
throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes);
|
||||||
|
|
||||||
|
outputStream.writeInt(RECORD_TERMINATOR);
|
||||||
|
return outputStream.size() - startSize;
|
||||||
|
}
|
||||||
|
|
||||||
public VariantContext decode(final String source, final GCFHeader header) {
|
public VariantContext decode(final String source, final GCFHeader header) {
|
||||||
final String contig = header.getString(chromOffset);
|
final String contig = header.getString(chromOffset);
|
||||||
alleleMap = header.getAlleles(alleleOffsets);
|
alleleMap = header.getAlleles(alleleOffsets);
|
||||||
|
|
@ -154,31 +185,6 @@ public class GCF {
|
||||||
|
|
||||||
public int getNAlleles() { return alleleOffsets.length; }
|
public int getNAlleles() { return alleleOffsets.length; }
|
||||||
|
|
||||||
public int write(DataOutputStream outputStream) throws IOException {
|
|
||||||
int startSize = outputStream.size();
|
|
||||||
outputStream.writeInt(chromOffset);
|
|
||||||
outputStream.writeInt(start);
|
|
||||||
outputStream.writeInt(stop);
|
|
||||||
outputStream.writeUTF(id);
|
|
||||||
outputStream.writeByte(refPad);
|
|
||||||
writeIntArray(alleleOffsets, outputStream, true);
|
|
||||||
outputStream.writeFloat(qual);
|
|
||||||
outputStream.writeUTF(info);
|
|
||||||
outputStream.writeInt(filterOffset);
|
|
||||||
|
|
||||||
int nGenotypes = genotypes.size();
|
|
||||||
int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes;
|
|
||||||
outputStream.writeInt(nGenotypes);
|
|
||||||
outputStream.writeInt(expectedSizeOfGenotypes);
|
|
||||||
int obsSizeOfGenotypes = 0;
|
|
||||||
for ( GCFGenotype g : genotypes )
|
|
||||||
obsSizeOfGenotypes += g.write(outputStream);
|
|
||||||
if ( obsSizeOfGenotypes != expectedSizeOfGenotypes )
|
|
||||||
throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes);
|
|
||||||
|
|
||||||
outputStream.writeInt(RECORD_TERMINATOR);
|
|
||||||
return outputStream.size() - startSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) {
|
private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) {
|
||||||
StringBuilder s = new StringBuilder();
|
StringBuilder s = new StringBuilder();
|
||||||
|
|
@ -200,13 +206,14 @@ public class GCF {
|
||||||
return s.toString();
|
return s.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private final static int BUFFER_SIZE = 1048576; // 2**20
|
protected final static int BUFFER_SIZE = 1048576; // 2**20
|
||||||
public static DataOutputStream createOutputStream(final File file) throws FileNotFoundException {
|
|
||||||
return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE));
|
public static DataInputStream createDataInputStream(final InputStream stream) {
|
||||||
|
return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static DataInputStream createInputStream(final File file) throws FileNotFoundException {
|
public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException {
|
||||||
return new DataInputStream(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE));
|
return new FileInputStream(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException {
|
protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException {
|
||||||
|
|
|
||||||
|
|
@ -30,9 +30,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
|
|
||||||
import java.io.DataInputStream;
|
import java.io.*;
|
||||||
import java.io.DataOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -65,25 +63,45 @@ import java.util.*;
|
||||||
public class GCFHeader {
|
public class GCFHeader {
|
||||||
final protected static Logger logger = Logger.getLogger(GCFHeader.class);
|
final protected static Logger logger = Logger.getLogger(GCFHeader.class);
|
||||||
|
|
||||||
private static byte[] MAGIC_HEADER = "GVCF0.1\1".getBytes();
|
public final static int GCF_VERSION = 1;
|
||||||
|
public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes();
|
||||||
|
public final static int FOOTER_START_MARKER = -1;
|
||||||
|
public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version
|
||||||
|
|
||||||
|
final int version;
|
||||||
|
long footerPosition;
|
||||||
final List<Allele> alleles;
|
final List<Allele> alleles;
|
||||||
final List<String> strings;
|
final List<String> strings;
|
||||||
final List<String> samples;
|
final List<String> samples;
|
||||||
final List<Set<String>> filters;
|
final List<Set<String>> filters;
|
||||||
|
|
||||||
public GCFHeader(final Map<Allele, Integer> allelesIn, final Map<String, Integer> stringIn, final Map<String, Integer> samplesIn) {
|
public GCFHeader(final Map<Allele, Integer> allelesIn, final Map<String, Integer> stringIn, final Map<String, Integer> samplesIn) {
|
||||||
|
version = GCF_VERSION;
|
||||||
|
footerPosition = 0;
|
||||||
this.alleles = linearize(allelesIn);
|
this.alleles = linearize(allelesIn);
|
||||||
this.strings = linearize(stringIn);
|
this.strings = linearize(stringIn);
|
||||||
this.samples = linearize(samplesIn);
|
this.samples = linearize(samplesIn);
|
||||||
this.filters = null; // not used with this constructor
|
this.filters = null; // not used with this constructor
|
||||||
}
|
}
|
||||||
|
|
||||||
public GCFHeader(DataInputStream inputStream) throws IOException {
|
public GCFHeader(FileInputStream fileInputStream) throws IOException {
|
||||||
byte[] headerTest = new byte[MAGIC_HEADER.length];
|
DataInputStream inputStream = new DataInputStream(fileInputStream);
|
||||||
|
byte[] headerTest = new byte[GCF_FILE_START_MARKER.length];
|
||||||
inputStream.read(headerTest);
|
inputStream.read(headerTest);
|
||||||
if ( ! Arrays.equals(headerTest, MAGIC_HEADER) ) {
|
if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) {
|
||||||
throw new UserException("Could not read GVCF file. MAGIC_HEADER missing. Saw " + headerTest);
|
throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest));
|
||||||
} else {
|
} else {
|
||||||
|
version = inputStream.readInt();
|
||||||
|
logger.info("Read GCF version " + version);
|
||||||
|
footerPosition = inputStream.readLong();
|
||||||
|
logger.info("Read footer position of " + footerPosition);
|
||||||
|
long lastPos = fileInputStream.getChannel().position();
|
||||||
|
logger.info(" Last position is " + lastPos);
|
||||||
|
|
||||||
|
// seek to the footer
|
||||||
|
fileInputStream.getChannel().position(footerPosition);
|
||||||
|
if ( inputStream.readInt() != FOOTER_START_MARKER )
|
||||||
|
throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker");
|
||||||
alleles = stringsToAlleles(readStrings(inputStream));
|
alleles = stringsToAlleles(readStrings(inputStream));
|
||||||
strings = readStrings(inputStream);
|
strings = readStrings(inputStream);
|
||||||
samples = readStrings(inputStream);
|
samples = readStrings(inputStream);
|
||||||
|
|
@ -91,19 +109,28 @@ public class GCFHeader {
|
||||||
logger.info(String.format("String map of %d elements", strings.size()));
|
logger.info(String.format("String map of %d elements", strings.size()));
|
||||||
logger.info(String.format("Sample map of %d elements", samples.size()));
|
logger.info(String.format("Sample map of %d elements", samples.size()));
|
||||||
filters = initializeFilterCache();
|
filters = initializeFilterCache();
|
||||||
|
fileInputStream.getChannel().position(lastPos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public int write(final DataOutputStream outputStream) throws IOException {
|
public static int writeHeader(final DataOutputStream outputStream) throws IOException {
|
||||||
int startBytes = outputStream.size();
|
int startBytes = outputStream.size();
|
||||||
outputStream.write(MAGIC_HEADER);
|
outputStream.write(GCF_FILE_START_MARKER);
|
||||||
|
outputStream.writeInt(GCF_VERSION);
|
||||||
|
outputStream.writeLong(0);
|
||||||
|
return outputStream.size() - startBytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int writeFooter(final DataOutputStream outputStream) throws IOException {
|
||||||
|
int startBytes = outputStream.size();
|
||||||
|
outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding
|
||||||
write(outputStream, allelesToStrings(alleles));
|
write(outputStream, allelesToStrings(alleles));
|
||||||
write(outputStream, strings);
|
write(outputStream, strings);
|
||||||
write(outputStream, samples);
|
write(outputStream, samples);
|
||||||
return outputStream.size() - startBytes;
|
return outputStream.size() - startBytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void write(DataOutputStream outputStream, List<String> l) throws IOException {
|
private void write(DataOutputStream outputStream, List<String> l) throws IOException {
|
||||||
outputStream.writeInt(l.size());
|
outputStream.writeInt(l.size());
|
||||||
for ( String elt : l ) outputStream.writeUTF(elt);
|
for ( String elt : l ) outputStream.writeUTF(elt);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,122 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.gcf;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GCFWriter implementing the VCFWriter interface
|
||||||
|
* @author Your Name
|
||||||
|
* @since Date created
|
||||||
|
*/
|
||||||
|
public class GCFWriter extends IndexingVCFWriter {
|
||||||
|
final boolean skipGenotypes;
|
||||||
|
final FileOutputStream fileOutputStream;
|
||||||
|
final DataOutputStream dataOutputStream;
|
||||||
|
final GCFHeaderBuilder gcfHeaderBuilder;
|
||||||
|
int nbytes = 0;
|
||||||
|
VCFHeader header = null;
|
||||||
|
File location;
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Constructors
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public GCFWriter(File location, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
|
||||||
|
super(writerName(location, null), location, null, enableOnTheFlyIndexing);
|
||||||
|
this.location = location;
|
||||||
|
this.skipGenotypes = doNotWriteGenotypes;
|
||||||
|
|
||||||
|
// write the output
|
||||||
|
try {
|
||||||
|
fileOutputStream = new FileOutputStream(location);
|
||||||
|
dataOutputStream = createDataOutputStream(fileOutputStream);
|
||||||
|
gcfHeaderBuilder = new GCFHeaderBuilder();
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new UserException.CouldNotCreateOutputFile(location, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// VCFWriter interface functions
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void writeHeader(VCFHeader header) {
|
||||||
|
this.header = header;
|
||||||
|
try {
|
||||||
|
nbytes += GCFHeader.writeHeader(dataOutputStream);
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void add(VariantContext vc) {
|
||||||
|
super.add(vc);
|
||||||
|
GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes);
|
||||||
|
try {
|
||||||
|
nbytes += gcf.write(dataOutputStream);
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
// todo -- write out VCF header lines
|
||||||
|
GCFHeader gcfHeader = gcfHeaderBuilder.createHeader();
|
||||||
|
try {
|
||||||
|
long headerPosition = nbytes;
|
||||||
|
nbytes += gcfHeader.writeFooter(dataOutputStream);
|
||||||
|
dataOutputStream.close();
|
||||||
|
//System.out.println("Writing forward reference to " + headerPosition);
|
||||||
|
|
||||||
|
RandomAccessFile raFile = new RandomAccessFile(location, "rw");
|
||||||
|
raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET);
|
||||||
|
raFile.writeLong(headerPosition);
|
||||||
|
raFile.close();
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e);
|
||||||
|
}
|
||||||
|
|
||||||
|
super.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final DataOutputStream createDataOutputStream(final OutputStream stream) {
|
||||||
|
return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue