diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java index 5ab241ebf..ef0d9ca42 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java @@ -79,8 +79,13 @@ public class GCF { } } - public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException { + public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException { chromOffset = inputStream.readInt(); + + // have we reached the footer? + if ( chromOffset == GCFHeader.FOOTER_START_MARKER ) + throw new EOFException(); + start = inputStream.readInt(); stop = inputStream.readInt(); id = inputStream.readUTF(); @@ -106,6 +111,32 @@ public class GCF { throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key"); } + public int write(DataOutputStream outputStream) throws IOException { + int startSize = outputStream.size(); + outputStream.writeInt(chromOffset); + outputStream.writeInt(start); + outputStream.writeInt(stop); + outputStream.writeUTF(id); + outputStream.writeByte(refPad); + writeIntArray(alleleOffsets, outputStream, true); + outputStream.writeFloat(qual); + outputStream.writeUTF(info); + outputStream.writeInt(filterOffset); + + int nGenotypes = genotypes.size(); + int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes; + outputStream.writeInt(nGenotypes); + outputStream.writeInt(expectedSizeOfGenotypes); + int obsSizeOfGenotypes = 0; + for ( GCFGenotype g : genotypes ) + obsSizeOfGenotypes += g.write(outputStream); + if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) + throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); + + outputStream.writeInt(RECORD_TERMINATOR); + return outputStream.size() - startSize; + } + public VariantContext decode(final String source, final GCFHeader header) { final String contig = header.getString(chromOffset); alleleMap = header.getAlleles(alleleOffsets); @@ -154,31 +185,6 @@ public class GCF { public int getNAlleles() { return alleleOffsets.length; } - public int write(DataOutputStream outputStream) throws IOException { - int startSize = outputStream.size(); - outputStream.writeInt(chromOffset); - outputStream.writeInt(start); - outputStream.writeInt(stop); - outputStream.writeUTF(id); - outputStream.writeByte(refPad); - writeIntArray(alleleOffsets, outputStream, true); - outputStream.writeFloat(qual); - outputStream.writeUTF(info); - outputStream.writeInt(filterOffset); - - int nGenotypes = genotypes.size(); - int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes; - outputStream.writeInt(nGenotypes); - outputStream.writeInt(expectedSizeOfGenotypes); - int obsSizeOfGenotypes = 0; - for ( GCFGenotype g : genotypes ) - obsSizeOfGenotypes += g.write(outputStream); - if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) - throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); - - outputStream.writeInt(RECORD_TERMINATOR); - return outputStream.size() - startSize; - } private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) { StringBuilder s = new StringBuilder(); @@ -200,13 +206,14 @@ public class GCF { return s.toString(); } - private final static int BUFFER_SIZE = 1048576; // 2**20 - public static DataOutputStream createOutputStream(final File file) throws FileNotFoundException { - return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE)); + protected final static int BUFFER_SIZE = 1048576; // 2**20 + + public static DataInputStream createDataInputStream(final InputStream stream) { + return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE)); } - public static DataInputStream createInputStream(final File file) throws FileNotFoundException { - return new DataInputStream(new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE)); + public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException { + return new FileInputStream(file); } protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException { diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java index d0c765cc4..6d96eda56 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java @@ -30,9 +30,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; +import java.io.*; import java.util.*; /** @@ -65,25 +63,45 @@ import java.util.*; public class GCFHeader { final protected static Logger logger = Logger.getLogger(GCFHeader.class); - private static byte[] MAGIC_HEADER = "GVCF0.1\1".getBytes(); + public final static int GCF_VERSION = 1; + public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes(); + public final static int FOOTER_START_MARKER = -1; + public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version + + final int version; + long footerPosition; final List alleles; final List strings; final List samples; final List> filters; public GCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { + version = GCF_VERSION; + footerPosition = 0; this.alleles = linearize(allelesIn); this.strings = linearize(stringIn); this.samples = linearize(samplesIn); this.filters = null; // not used with this constructor } - public GCFHeader(DataInputStream inputStream) throws IOException { - byte[] headerTest = new byte[MAGIC_HEADER.length]; + public GCFHeader(FileInputStream fileInputStream) throws IOException { + DataInputStream inputStream = new DataInputStream(fileInputStream); + byte[] headerTest = new byte[GCF_FILE_START_MARKER.length]; inputStream.read(headerTest); - if ( ! Arrays.equals(headerTest, MAGIC_HEADER) ) { - throw new UserException("Could not read GVCF file. MAGIC_HEADER missing. Saw " + headerTest); + if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) { + throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest)); } else { + version = inputStream.readInt(); + logger.info("Read GCF version " + version); + footerPosition = inputStream.readLong(); + logger.info("Read footer position of " + footerPosition); + long lastPos = fileInputStream.getChannel().position(); + logger.info(" Last position is " + lastPos); + + // seek to the footer + fileInputStream.getChannel().position(footerPosition); + if ( inputStream.readInt() != FOOTER_START_MARKER ) + throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker"); alleles = stringsToAlleles(readStrings(inputStream)); strings = readStrings(inputStream); samples = readStrings(inputStream); @@ -91,19 +109,28 @@ public class GCFHeader { logger.info(String.format("String map of %d elements", strings.size())); logger.info(String.format("Sample map of %d elements", samples.size())); filters = initializeFilterCache(); + fileInputStream.getChannel().position(lastPos); } } - public int write(final DataOutputStream outputStream) throws IOException { + public static int writeHeader(final DataOutputStream outputStream) throws IOException { int startBytes = outputStream.size(); - outputStream.write(MAGIC_HEADER); + outputStream.write(GCF_FILE_START_MARKER); + outputStream.writeInt(GCF_VERSION); + outputStream.writeLong(0); + return outputStream.size() - startBytes; + } + + public int writeFooter(final DataOutputStream outputStream) throws IOException { + int startBytes = outputStream.size(); + outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding write(outputStream, allelesToStrings(alleles)); write(outputStream, strings); write(outputStream, samples); return outputStream.size() - startBytes; } - public void write(DataOutputStream outputStream, List l) throws IOException { + private void write(DataOutputStream outputStream, List l) throws IOException { outputStream.writeInt(l.size()); for ( String elt : l ) outputStream.writeUTF(elt); } diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java new file mode 100644 index 000000000..7ff6e27a2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; + +/** + * GCFWriter implementing the VCFWriter interface + * @author Your Name + * @since Date created + */ +public class GCFWriter extends IndexingVCFWriter { + final boolean skipGenotypes; + final FileOutputStream fileOutputStream; + final DataOutputStream dataOutputStream; + final GCFHeaderBuilder gcfHeaderBuilder; + int nbytes = 0; + VCFHeader header = null; + File location; + + // -------------------------------------------------------------------------------- + // + // Constructors + // + // -------------------------------------------------------------------------------- + + public GCFWriter(File location, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, null), location, null, enableOnTheFlyIndexing); + this.location = location; + this.skipGenotypes = doNotWriteGenotypes; + + // write the output + try { + fileOutputStream = new FileOutputStream(location); + dataOutputStream = createDataOutputStream(fileOutputStream); + gcfHeaderBuilder = new GCFHeaderBuilder(); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(location, e); + } + } + + // -------------------------------------------------------------------------------- + // + // VCFWriter interface functions + // + // -------------------------------------------------------------------------------- + + @Override + public void writeHeader(VCFHeader header) { + this.header = header; + try { + nbytes += GCFHeader.writeHeader(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e); + } + } + + @Override + public void add(VariantContext vc) { + super.add(vc); + GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes); + try { + nbytes += gcf.write(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e); + } + } + + @Override + public void close() { + // todo -- write out VCF header lines + GCFHeader gcfHeader = gcfHeaderBuilder.createHeader(); + try { + long headerPosition = nbytes; + nbytes += gcfHeader.writeFooter(dataOutputStream); + dataOutputStream.close(); + //System.out.println("Writing forward reference to " + headerPosition); + + RandomAccessFile raFile = new RandomAccessFile(location, "rw"); + raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET); + raFile.writeLong(headerPosition); + raFile.close(); + } catch ( IOException e ) { + throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e); + } + + super.close(); + } + + private static final DataOutputStream createDataOutputStream(final OutputStream stream) { + return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE)); + } + +}