gatk-3.8/java/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream...

178 lines
7.5 KiB
Java

/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.sam.util;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.zip.CRC32;
import java.util.zip.Deflater;
/**
* Writer for a file that is a series of gzip blocks. The caller just treats it as an
* OutputStream, and under the covers a gzip block is written when the amount of uncompressed as-yet-unwritten
* bytes reaches a threshold. Note that the flush() method should not be called by client
* unless you know what you're doing, because it forces a gzip block to be written even if the
* number of buffered bytes has not reached threshold. close(), on the other hand, must be called
* when done writing in order to force the last gzip block to be written.
*/
public class BlockCompressedOutputStream
    extends OutputStream
{
    private final BinaryCodec codec;

    // Uncompressed bytes accumulate here; a gzip block is emitted when it fills.
    private final byte[] uncompressedBuffer = new byte[BlockCompressedStreamConstants.DEFAULT_UNCOMPRESSED_BLOCK_SIZE];
    private int numUncompressedBytes = 0;

    // Holds one block's worth of deflated payload; the gzip header and footer are
    // written separately by writeGzipBlock(), hence the header length is subtracted.
    private final byte[] compressedBuffer =
            new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE -
                     BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];

    // NOTE(review): GZIP_CM_DEFLATE is the gzip "compression method" byte (value 8),
    // but this Deflater constructor argument is the compression *level* (0-9).
    // Level 8 happens to be legal, so the value is kept to preserve output byte-for-byte,
    // but Deflater.DEFAULT_COMPRESSION was probably intended -- confirm before changing.
    // nowrap=true because the gzip framing is written by hand in writeGzipBlock().
    private final Deflater deflater = new Deflater(BlockCompressedStreamConstants.GZIP_CM_DEFLATE, true);

    private final CRC32 crc32 = new CRC32();
    private final byte[] singleByteArray = new byte[1];

    // Diagnostic counter: number of times a block had to be re-deflated with less input
    // because the compressed result did not fit in compressedBuffer.
    private int numberOfThrottleBacks = 0;

    /**
     * Prepares to write a BGZF file at the given path.
     * @param filename path of the output file.
     */
    public BlockCompressedOutputStream(final String filename) {
        codec = new BinaryCodec(filename, true);
    }

    /**
     * Prepares to write a BGZF file at the given location.
     * @param file the output file.
     */
    public BlockCompressedOutputStream(final File file) {
        codec = new BinaryCodec(file, true);
    }

    /**
     * Buffers all of {@code bytes}, flushing complete gzip blocks as the buffer fills.
     */
    @Override
    public void write(final byte[] bytes) throws IOException {
        write(bytes, 0, bytes.length);
    }

    /**
     * Buffers {@code numBytes} bytes starting at {@code startIndex}, flushing a gzip
     * block each time the internal uncompressed buffer becomes full.
     * @param bytes source array.
     * @param startIndex offset of the first byte to write.
     * @param numBytes number of bytes to write.
     */
    @Override
    public void write(final byte[] bytes, int startIndex, int numBytes) throws IOException {
        assert(numUncompressedBytes < uncompressedBuffer.length);
        while (numBytes > 0) {
            final int bytesToWrite = Math.min(uncompressedBuffer.length - numUncompressedBytes, numBytes);
            System.arraycopy(bytes, startIndex, uncompressedBuffer, numUncompressedBytes, bytesToWrite);
            numUncompressedBytes += bytesToWrite;
            startIndex += bytesToWrite;
            numBytes -= bytesToWrite;
            assert(numBytes >= 0);
            if (numUncompressedBytes == uncompressedBuffer.length) {
                deflateBlock();
            }
        }
    }

    /**
     * WARNING: flush() affects the output format, because it causes the current contents of
     * uncompressedBuffer to be compressed and written, even if it isn't full. Unless you know
     * what you're doing, don't call flush(). Instead, call close(), which will flush any
     * unwritten data before closing the underlying stream.
     */
    @Override
    public void flush() throws IOException {
        // deflateBlock() may not consume the whole buffer in one pass (throttle-back),
        // so loop until everything buffered has been written.
        while (numUncompressedBytes > 0) {
            deflateBlock();
        }
        codec.getOutputStream().flush();
    }

    /**
     * close() must be called in order to flush any remaining buffered bytes.
     */
    @Override
    public void close() throws IOException {
        flush();
        if (numberOfThrottleBacks > 0) {
            System.err.println("In BlockCompressedOutputStream, had to throttle back " + numberOfThrottleBacks +
                    " times for file " + codec.getOutputFileName());
        }
        codec.close();
    }

    /**
     * Writes the low-order byte of {@code i}, buffering it like any other write.
     */
    @Override
    public void write(final int i) throws IOException {
        singleByteArray[0] = (byte)i;
        write(singleByteArray);
    }

    /**
     * Attempt to write the data in uncompressedBuffer to the underlying file in a gzip block.
     * If the entire uncompressedBuffer does not fit in the maximum allowed size, reduce the amount
     * of data to be compressed, and slide the excess down in uncompressedBuffer so it can be picked
     * up in the next deflate event.
     * @return size of gzip block that was written.
     */
    private int deflateBlock() {
        if (numUncompressedBytes == 0) {
            return 0;
        }
        int bytesToCompress = numUncompressedBytes;
        while (true) {
            // Compress the input
            deflater.reset();
            deflater.setInput(uncompressedBuffer, 0, bytesToCompress);
            deflater.finish();
            final int compressedSize = deflater.deflate(compressedBuffer, 0, compressedBuffer.length);

            // If the deflater has not finished, the compressed output did not all fit in
            // compressedBuffer; reduce the amount to be compressed and try again.
            // (Checking finished() rather than getBytesRead() < bytesToCompress also catches
            // the case where all input was consumed but trailing deflate-stream bytes are
            // still pending, which would otherwise write a truncated, corrupt block.)
            if (!deflater.finished()) {
                bytesToCompress -= BlockCompressedStreamConstants.UNCOMPRESSED_THROTTLE_AMOUNT;
                ++numberOfThrottleBacks;
                if (bytesToCompress <= 0) {
                    // Formerly only an assert; with assertions disabled this loop could spin
                    // forever or hand a negative length to setInput(). Fail loudly instead.
                    throw new IllegalStateException(
                            "Unable to fit compressed block into " + compressedBuffer.length + " bytes");
                }
                continue;
            }

            // Data compressed small enough, so write it out.
            crc32.reset();
            crc32.update(uncompressedBuffer, 0, bytesToCompress);
            final int totalBlockSize = writeGzipBlock(compressedSize, bytesToCompress, crc32.getValue());
            assert(bytesToCompress <= numUncompressedBytes);

            // Clear out from uncompressedBuffer the data that was written
            if (bytesToCompress == numUncompressedBytes) {
                numUncompressedBytes = 0;
            } else {
                System.arraycopy(uncompressedBuffer, bytesToCompress, uncompressedBuffer, 0,
                        numUncompressedBytes - bytesToCompress);
                numUncompressedBytes -= bytesToCompress;
            }
            return totalBlockSize;
        }
        // unreachable
    }

    /**
     * Writes the entire gzip block, assuming the compressed data is stored in compressedBuffer.
     * @param compressedSize number of valid bytes in compressedBuffer.
     * @param uncompressedSize number of input bytes that produced the compressed data (ISIZE field).
     * @param crc CRC32 of the uncompressed data (stored in the gzip footer).
     * @return size of gzip block that was written.
     */
    private int writeGzipBlock(final int compressedSize, final int uncompressedSize, final long crc) {
        // Init gzip header
        codec.writeByte(BlockCompressedStreamConstants.GZIP_ID1);
        codec.writeByte(BlockCompressedStreamConstants.GZIP_ID2);
        codec.writeByte(BlockCompressedStreamConstants.GZIP_CM_DEFLATE);
        codec.writeByte(BlockCompressedStreamConstants.GZIP_FLG);
        codec.writeInt(0); // Modification time
        codec.writeByte(BlockCompressedStreamConstants.GZIP_XFL);
        codec.writeByte(BlockCompressedStreamConstants.GZIP_OS_UNKNOWN);
        // BGZF extra field: carries the total block size so readers can skip block-by-block.
        codec.writeShort(BlockCompressedStreamConstants.GZIP_XLEN);
        codec.writeByte(BlockCompressedStreamConstants.BGZF_ID1);
        codec.writeByte(BlockCompressedStreamConstants.BGZF_ID2);
        codec.writeShort(BlockCompressedStreamConstants.BGZF_LEN);
        final int totalBlockSize = compressedSize + BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH +
                BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH;
        // I don't know why we store block size - 1, but that is what the spec says
        codec.writeShort((short)(totalBlockSize - 1));
        codec.writeBytes(compressedBuffer, 0, compressedSize);
        codec.writeInt((int)crc);
        codec.writeInt(uncompressedSize);
        return totalBlockSize;
    }
}