178 lines
7.5 KiB
Java
178 lines
7.5 KiB
Java
|
|
/*
|
||
|
|
* The Broad Institute
|
||
|
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||
|
|
* This software and its documentation are copyright 2009 by the
|
||
|
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||
|
|
*
|
||
|
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||
|
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||
|
|
*/
|
||
|
|
package edu.mit.broad.sam.util;
|
||
|
|
|
||
|
|
import java.io.File;
|
||
|
|
import java.io.IOException;
|
||
|
|
import java.io.OutputStream;
|
||
|
|
import java.util.zip.CRC32;
|
||
|
|
import java.util.zip.Deflater;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Writer for a file that is a series of gzip blocks. The caller just treats it as an
|
||
|
|
* OutputStream, and under the covers a gzip block is written when the amount of uncompressed as-yet-unwritten
|
||
|
|
* bytes reaches a threshold. Note that the flush() method should not be called by client
|
||
|
|
* unless you know what you're doing, because it forces a gzip block to be written even if the
|
||
|
|
* number of buffered bytes has not reached threshold. close(), on the other hand, must be called
|
||
|
|
* when done writing in order to force the last gzip block to be written.
|
||
|
|
*/
|
||
|
|
public class BlockCompressedOutputStream
|
||
|
|
extends OutputStream
|
||
|
|
{
|
||
|
|
private final BinaryCodec codec;
|
||
|
|
private final byte[] uncompressedBuffer = new byte[BlockCompressedStreamConstants.DEFAULT_UNCOMPRESSED_BLOCK_SIZE];
|
||
|
|
private int numUncompressedBytes = 0;
|
||
|
|
private final byte[] compressedBuffer =
|
||
|
|
new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE -
|
||
|
|
BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
|
||
|
|
private final Deflater deflater = new Deflater(BlockCompressedStreamConstants.GZIP_CM_DEFLATE, true);
|
||
|
|
private final CRC32 crc32 = new CRC32();
|
||
|
|
private final byte[] singleByteArray = new byte[1];
|
||
|
|
|
||
|
|
private int numberOfThrottleBacks = 0;
|
||
|
|
|
||
|
|
public BlockCompressedOutputStream(final String filename) {
|
||
|
|
codec = new BinaryCodec(filename, true);
|
||
|
|
}
|
||
|
|
|
||
|
|
public BlockCompressedOutputStream(final File file) {
|
||
|
|
codec = new BinaryCodec(file, true);
|
||
|
|
}
|
||
|
|
|
||
|
|
@Override
|
||
|
|
public void write(final byte[] bytes) throws IOException {
|
||
|
|
write(bytes, 0, bytes.length);
|
||
|
|
}
|
||
|
|
|
||
|
|
@Override
|
||
|
|
public void write(final byte[] bytes, int startIndex, int numBytes) throws IOException {
|
||
|
|
assert(numUncompressedBytes < uncompressedBuffer.length);
|
||
|
|
while (numBytes > 0) {
|
||
|
|
final int bytesToWrite = Math.min(uncompressedBuffer.length - numUncompressedBytes, numBytes);
|
||
|
|
System.arraycopy(bytes, startIndex, uncompressedBuffer, numUncompressedBytes, bytesToWrite);
|
||
|
|
numUncompressedBytes += bytesToWrite;
|
||
|
|
startIndex += bytesToWrite;
|
||
|
|
numBytes -= bytesToWrite;
|
||
|
|
assert(numBytes >= 0);
|
||
|
|
if (numUncompressedBytes == uncompressedBuffer.length) {
|
||
|
|
deflateBlock();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* WARNING: flush() affects the output format, because it causes the current contents of uncompressedBuffer
|
||
|
|
* to be compressed and written, even if it isn't full. Unless you know what you're doing, don't call flush().
|
||
|
|
* Instead, call close(), which will flush any unwritten data before closing the underlying stream.
|
||
|
|
*
|
||
|
|
*/
|
||
|
|
@Override
|
||
|
|
public void flush() throws IOException {
|
||
|
|
while (numUncompressedBytes > 0) {
|
||
|
|
deflateBlock();
|
||
|
|
}
|
||
|
|
codec.getOutputStream().flush();
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* close() must be called in order to flush any remaining buffered bytes.
|
||
|
|
*
|
||
|
|
*/
|
||
|
|
@Override
|
||
|
|
public void close() throws IOException {
|
||
|
|
flush();
|
||
|
|
if (numberOfThrottleBacks > 0) {
|
||
|
|
System.err.println("In BlockCompressedOutputStream, had to throttle back " + numberOfThrottleBacks +
|
||
|
|
" times for file " + codec.getOutputFileName());
|
||
|
|
}
|
||
|
|
codec.close();
|
||
|
|
}
|
||
|
|
|
||
|
|
public void write(final int i) throws IOException {
|
||
|
|
singleByteArray[0] = (byte)i;
|
||
|
|
write(singleByteArray);
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Attempt to write the data in uncompressedBuffer to the underlying file in a gzip block.
|
||
|
|
* If the entire uncompressedBuffer does not fit in the maximum allowed size, reduce the amount
|
||
|
|
* of data to be compressed, and slide the excess down in uncompressedBuffer so it can be picked
|
||
|
|
* up in the next deflate event.
|
||
|
|
* @return size of gzip block that was written.
|
||
|
|
*/
|
||
|
|
private int deflateBlock() {
|
||
|
|
if (numUncompressedBytes == 0) {
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
int bytesToCompress = numUncompressedBytes;
|
||
|
|
while (true) {
|
||
|
|
// Compress the input
|
||
|
|
deflater.reset();
|
||
|
|
deflater.setInput(uncompressedBuffer, 0, bytesToCompress);
|
||
|
|
deflater.finish();
|
||
|
|
final int compressedSize = deflater.deflate(compressedBuffer, 0, compressedBuffer.length);
|
||
|
|
|
||
|
|
// If it didn't all fit in compressedBuffer.length, reduce the amount to
|
||
|
|
// be compressed and try again.
|
||
|
|
if (deflater.getBytesRead() < bytesToCompress) {
|
||
|
|
bytesToCompress -= BlockCompressedStreamConstants.UNCOMPRESSED_THROTTLE_AMOUNT;
|
||
|
|
++numberOfThrottleBacks;
|
||
|
|
assert(bytesToCompress > 0);
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
// Data compressed small enough, so write it out.
|
||
|
|
crc32.reset();
|
||
|
|
crc32.update(uncompressedBuffer, 0, bytesToCompress);
|
||
|
|
|
||
|
|
final int totalBlockSize = writeGzipBlock(compressedSize, bytesToCompress, crc32.getValue());
|
||
|
|
assert(bytesToCompress <= numUncompressedBytes);
|
||
|
|
|
||
|
|
// Clear out from uncompressedBuffer the data that was written
|
||
|
|
if (bytesToCompress == numUncompressedBytes) {
|
||
|
|
numUncompressedBytes = 0;
|
||
|
|
} else {
|
||
|
|
System.arraycopy(uncompressedBuffer, bytesToCompress, uncompressedBuffer, 0,
|
||
|
|
numUncompressedBytes - bytesToCompress);
|
||
|
|
numUncompressedBytes -= bytesToCompress;
|
||
|
|
}
|
||
|
|
return totalBlockSize;
|
||
|
|
}
|
||
|
|
// unreachable
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Writes the entire gzip block, assuming the compressed data is stored in compressedBuffer
|
||
|
|
* @return size of gzip block that was written.
|
||
|
|
*/
|
||
|
|
private int writeGzipBlock(final int compressedSize, final int uncompressedSize, final long crc) {
|
||
|
|
// Init gzip header
|
||
|
|
codec.writeByte(BlockCompressedStreamConstants.GZIP_ID1);
|
||
|
|
codec.writeByte(BlockCompressedStreamConstants.GZIP_ID2);
|
||
|
|
codec.writeByte(BlockCompressedStreamConstants.GZIP_CM_DEFLATE);
|
||
|
|
codec.writeByte(BlockCompressedStreamConstants.GZIP_FLG);
|
||
|
|
codec.writeInt(0); // Modification time
|
||
|
|
codec.writeByte(BlockCompressedStreamConstants.GZIP_XFL);
|
||
|
|
codec.writeByte(BlockCompressedStreamConstants.GZIP_OS_UNKNOWN);
|
||
|
|
codec.writeShort(BlockCompressedStreamConstants.GZIP_XLEN);
|
||
|
|
codec.writeByte(BlockCompressedStreamConstants.BGZF_ID1);
|
||
|
|
codec.writeByte(BlockCompressedStreamConstants.BGZF_ID2);
|
||
|
|
codec.writeShort(BlockCompressedStreamConstants.BGZF_LEN);
|
||
|
|
final int totalBlockSize = compressedSize + BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH +
|
||
|
|
BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH;
|
||
|
|
|
||
|
|
// I don't know why we store block size - 1, but that is what the spec says
|
||
|
|
codec.writeShort((short)(totalBlockSize - 1));
|
||
|
|
codec.writeBytes(compressedBuffer, 0, compressedSize);
|
||
|
|
codec.writeInt((int)crc);
|
||
|
|
codec.writeInt(uncompressedSize);
|
||
|
|
return totalBlockSize;
|
||
|
|
}
|
||
|
|
}
|