From 497ae700c41414529a66cc2993378aea5d74fa5d Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 30 Dec 2009 18:19:51 +0000 Subject: [PATCH] A rethink of the existing BAM block extraction code: rather than working in chunk space directly, stream data in block space, converting to chunk space on demand. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2484 348d0f76-0448-11de-a6fe-93d51630548a --- ...unkIterator.java => BAMBlockIterator.java} | 31 +++++--- .../net/sf/samtools/BAMFileHeaderLoader.java | 77 +++++++++++++++++++ java/src/net/sf/samtools/Block.java | 34 ++++++++ java/src/net/sf/samtools/BlockReader.java | 35 ++++++--- .../src/net/sf/samtools/BlockTestHarness.java | 46 +++++++++++ java/src/net/sf/samtools/Chunk.java | 5 ++ .../src/net/sf/samtools/ChunkTestHarness.java | 51 ------------ 7 files changed, 208 insertions(+), 71 deletions(-) rename java/src/net/sf/samtools/{BAMChunkIterator.java => BAMBlockIterator.java} (65%) create mode 100644 java/src/net/sf/samtools/BAMFileHeaderLoader.java create mode 100644 java/src/net/sf/samtools/Block.java create mode 100644 java/src/net/sf/samtools/BlockTestHarness.java delete mode 100644 java/src/net/sf/samtools/ChunkTestHarness.java diff --git a/java/src/net/sf/samtools/BAMChunkIterator.java b/java/src/net/sf/samtools/BAMBlockIterator.java similarity index 65% rename from java/src/net/sf/samtools/BAMChunkIterator.java rename to java/src/net/sf/samtools/BAMBlockIterator.java index 8a2571cd5..0898237f8 100644 --- a/java/src/net/sf/samtools/BAMChunkIterator.java +++ b/java/src/net/sf/samtools/BAMBlockIterator.java @@ -1,9 +1,18 @@ package net.sf.samtools; +import net.sf.samtools.util.BinaryCodec; +import net.sf.samtools.util.BlockCompressedInputStream; +import net.sf.samtools.util.StringLineReader; +import net.sf.samtools.util.CloseableIterator; +import net.sf.samtools.util.SeekableStream; + import java.util.Iterator; import java.util.NoSuchElementException; +import java.util.Arrays; import 
java.nio.channels.FileChannel; import java.io.IOException; +import java.io.FileInputStream; +import java.io.File; /** * Walks over a BAM file, discovering and returning the starting location of each block @@ -12,7 +21,7 @@ import java.io.IOException; * @author mhanna * @version 0.1 */ -public class BAMChunkIterator implements Iterator { +public class BAMBlockIterator implements CloseableIterator { /** * File channel from which to read chunks. */ @@ -25,10 +34,14 @@ public class BAMChunkIterator implements Iterator { /** * Iterate through the BAM chunks in a file. - * @param channel File channel to use when accessing the BAM. + * @param file File to use when accessing the BAM. */ - public BAMChunkIterator(FileChannel channel) { - this.blockReader = new BlockReader(channel); + public BAMBlockIterator(File file) throws IOException { + FileInputStream inputStream = new FileInputStream(file); + this.blockReader = new BlockReader(inputStream); + } + + public void close() { } /** @@ -48,19 +61,19 @@ public class BAMChunkIterator implements Iterator { * @return The next chunk. * @throw NoSuchElementException if no next chunk is available. 
*/ - public Chunk next() { + public Block next() { if(!hasNext()) throw new NoSuchElementException("No next chunk is available."); - Chunk chunk = null; + Block block = null; try { - chunk = blockReader.getChunkAt(position); - position = (chunk.getChunkEnd() >> 16) + 1; + block = blockReader.getBlockAt(position); + position = block.position + block.compressedBlockSize; } catch(IOException ex) { throw new SAMException("Unable to completely read chunk at end of file.", ex); } - return chunk; + return block; } /** diff --git a/java/src/net/sf/samtools/BAMFileHeaderLoader.java b/java/src/net/sf/samtools/BAMFileHeaderLoader.java new file mode 100644 index 000000000..d1a156874 --- /dev/null +++ b/java/src/net/sf/samtools/BAMFileHeaderLoader.java @@ -0,0 +1,77 @@ +package net.sf.samtools; + +import net.sf.samtools.util.BlockCompressedInputStream; +import net.sf.samtools.util.BinaryCodec; +import net.sf.samtools.util.StringLineReader; + +import java.io.File; +import java.io.IOException; +import java.io.DataInputStream; +import java.util.Arrays; + +/** + * Loads a BAM file header from a file, optionally providing its position + * within the file. + * + * @author mhanna + * @version 0.1 + */ +public class BAMFileHeaderLoader { + /** + * The contents of the BAM file header. + */ + private final SAMFileHeader header; + + /** + * Location of the header within the BAM file. + */ + private final Chunk location; + + /** + * Load the header from the given file. + * @param header the parsed header for the BAM file. + * @param location the location of the header (start and stop) within the BAM. + */ + private BAMFileHeaderLoader(SAMFileHeader header, Chunk location) { + this.header = header; + this.location = location; + } + + /** + * Gets the header for the given BAM file. + * @return The header for this BAM file. + */ + public SAMFileHeader getHeader() { + return header; + } + + /** + * Gets the location of the header within the given BAM file, in chunk format. 
+ * @return the location of the header, in chunk coordinates. + */ + public Chunk getLocation() { + return location; + } + + public static BAMFileHeaderLoader load(File file) throws IOException { + BlockCompressedInputStream inputStream = new BlockCompressedInputStream(file); + BinaryCodec binaryCodec = new BinaryCodec(new DataInputStream(inputStream)); + + final byte[] buffer = new byte[4]; + binaryCodec.readBytes(buffer); + if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { + throw new IOException("Invalid BAM file header"); + } + + final int headerTextLength = binaryCodec.readInt(); + final String textHeader = binaryCodec.readString(headerTextLength); + final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); + headerCodec.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); + SAMFileHeader header = headerCodec.decode(new StringLineReader(textHeader),file.getAbsolutePath()); + + inputStream.close(); + + return new BAMFileHeaderLoader(header,new Chunk(buffer.length,inputStream.getFilePointer())); + } + +} diff --git a/java/src/net/sf/samtools/Block.java b/java/src/net/sf/samtools/Block.java new file mode 100644 index 000000000..d3b7c5393 --- /dev/null +++ b/java/src/net/sf/samtools/Block.java @@ -0,0 +1,34 @@ +package net.sf.samtools; + +/** + * Represents the position of a block on disk. + * + * @author mhanna + * @version 0.1 + */ +public class Block { + public final long position; + public final int compressedBlockSize; + public final long uncompressedBlockSize; + + /** + * Create a block, loading no data into memory. + * @param position Position of this block on disk. + * @param compressedBlockSize Size of the block on disk; if compressedData is present, should match compressedData.length. + * @param uncompressedBlockSize Size of the data in the block. 
+ */ + public Block(final long position, final int compressedBlockSize, final long uncompressedBlockSize) { + this.position = position; + this.compressedBlockSize = compressedBlockSize; + this.uncompressedBlockSize = uncompressedBlockSize; + } + + /** + * Build a string representation of the block. + * @return A string indicating position and size. + */ + @Override + public String toString() { + return String.format("Block: pos = %d, compressed size = %d, uncompressed size = %d",position,compressedBlockSize,uncompressedBlockSize); + } +} diff --git a/java/src/net/sf/samtools/BlockReader.java b/java/src/net/sf/samtools/BlockReader.java index a2f91080e..1da6b5cd3 100644 --- a/java/src/net/sf/samtools/BlockReader.java +++ b/java/src/net/sf/samtools/BlockReader.java @@ -3,12 +3,13 @@ package net.sf.samtools; import net.sf.samtools.util.BlockCompressedStreamConstants; import java.io.IOException; +import java.io.FileInputStream; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.channels.FileChannel; /** - * Read an individual block or chunk from the BGZF file. + * Read an individual block from the BGZF file. * * @author mhanna * @version 0.1 @@ -26,14 +27,22 @@ public class BlockReader { /** * Create a new block reader. Block readers can operate independently on the same input file. - * @param channel File channel from which to read. + * @param inputStream InputStream from which to read. */ - public BlockReader(final FileChannel channel) { - this.channel = channel; + public BlockReader(final FileInputStream inputStream) { + this.channel = inputStream.getChannel(); this.buffer = ByteBuffer.allocateDirect(BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE); buffer.order(ByteOrder.LITTLE_ENDIAN); } + /** + * Closes the block reader's channel. + * @throws IOException On failure to close channel. + */ + public void close() throws IOException { + this.channel.close(); + } + /** * Determine whether the given offset is at the end of the file. 
* @param position Position at which to start reading. Must be at the beginning of a block. @@ -51,7 +60,7 @@ public class BlockReader { * @return Chunk of the BGZF file, starting at position. * @throws IOException if the chunk could not be read. */ - public Chunk getChunkAt(long position) throws IOException { + public Block getBlockAt(long position) throws IOException { buffer.rewind(); buffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); int count = channel.read(buffer,position); @@ -78,14 +87,18 @@ public class BlockReader { // Skip blocksize subfield intro buffer.position(buffer.position() + 4); // Read ushort - final int totalBlockSize = (buffer.getShort() & 0xffff) + 1; - if (totalBlockSize < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || totalBlockSize > buffer.capacity()) { - throw new IOException("Unexpected compressed block length: " + totalBlockSize); + final int compressedBlockSize = (buffer.getShort() & 0xffff) + 1; + if (compressedBlockSize < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || compressedBlockSize > buffer.capacity()) { + throw new IOException("Unexpected compressed block length: " + compressedBlockSize); } - final long chunkStart = position << 16; - final long chunkEnd = (position+totalBlockSize-1) << 16; + // Read the uncompressed block size + buffer.rewind(); + buffer.limit(4); + channel.read(buffer,position+compressedBlockSize-4); + buffer.flip(); + final int uncompressedBlockSize = buffer.getInt(); - return new Chunk(chunkStart,chunkEnd); + return new Block(position,compressedBlockSize,uncompressedBlockSize); } } diff --git a/java/src/net/sf/samtools/BlockTestHarness.java b/java/src/net/sf/samtools/BlockTestHarness.java new file mode 100644 index 000000000..cc0d60757 --- /dev/null +++ b/java/src/net/sf/samtools/BlockTestHarness.java @@ -0,0 +1,46 @@ +package net.sf.samtools; + +import java.io.File; +import java.io.IOException; + +/** + * Test harness for playing with sharding by BGZF block. 
+ * + * @author mhanna + * @version 0.1 + */ +public class BlockTestHarness { + private static void usage() { + System.out.println("Usage: ChunkTestHarness .bam"); + System.exit(1); + } + + public static void main(String args[]) throws IOException { + if(args.length == 0) + usage(); + + String bamFileName = args[0]; + if(!bamFileName.endsWith(".bam")) + usage(); + + File bamFile = new File(bamFileName); + if(!bamFile.exists()) + usage(); + + Chunk headerLocation = BAMFileHeaderLoader.load(bamFile).getLocation(); + System.out.printf("Header location = %s%n", headerLocation); + + BAMBlockIterator blockIterator = new BAMBlockIterator(bamFile); + long blockCount = 0; + + long startTime = System.currentTimeMillis(); + while(blockIterator.hasNext()) { + Block block = blockIterator.next(); + blockCount++; + //System.out.println(block); + } + long endTime = System.currentTimeMillis(); + + System.out.printf("Number of chunks: %d; elapsed time: %dms%n", blockCount, endTime-startTime); + } +} diff --git a/java/src/net/sf/samtools/Chunk.java b/java/src/net/sf/samtools/Chunk.java index a7bb1d565..aacbd89e1 100644 --- a/java/src/net/sf/samtools/Chunk.java +++ b/java/src/net/sf/samtools/Chunk.java @@ -60,4 +60,9 @@ class Chunk implements Comparable { result = 31 * result + (int) (mChunkEnd ^ (mChunkEnd >>> 32)); return result; } + + @Override + public String toString() { + return String.format("%d:%d-%d:%d",mChunkStart >> 16,mChunkStart & 0xFFFF,mChunkEnd >> 16,mChunkEnd & 0xFFFF); + } } diff --git a/java/src/net/sf/samtools/ChunkTestHarness.java b/java/src/net/sf/samtools/ChunkTestHarness.java deleted file mode 100644 index dd37b347e..000000000 --- a/java/src/net/sf/samtools/ChunkTestHarness.java +++ /dev/null @@ -1,51 +0,0 @@ -package net.sf.samtools; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.channels.FileChannel; - -/** - * Test harness for playing with sharding by BGZF block. 
- * - * @author mhanna - * @version 0.1 - */ -public class ChunkTestHarness { - private static void usage() { - System.out.println("Usage: ChunkTestHarness .bam"); - System.exit(1); - } - - public static void main(String args[]) throws IOException { - if(args.length == 0) - usage(); - - String bamFileName = args[0]; - if(!bamFileName.endsWith(".bam")) - usage(); - - File bamFile = new File(bamFileName); - if(!bamFile.exists()) - usage(); - - FileInputStream bamInputStream = new FileInputStream(bamFile); - FileChannel bamInputChannel = bamInputStream.getChannel(); - - BAMChunkIterator chunkIterator = new BAMChunkIterator(bamInputChannel); - long chunkCount = 0; - - long startTime = System.currentTimeMillis(); - while(chunkIterator.hasNext()) { - Chunk chunk = chunkIterator.next(); - chunkCount++; - System.out.printf("Chunk: [%d,%d)\tByte offsets: [%d,%d)%n",chunk.getChunkStart(), - chunk.getChunkEnd(), - chunk.getChunkStart()>>16, - chunk.getChunkEnd()>>16); - } - long endTime = System.currentTimeMillis(); - - System.out.printf("Number of chunks: %d; elasped time: %dms%n", chunkCount, endTime-startTime); - } -}