From 8ae14793f246abe4897fe4e8e2e5f5a37341b7ad Mon Sep 17 00:00:00 2001 From: hanna Date: Mon, 28 Mar 2011 22:25:45 +0000 Subject: [PATCH] Small standalone utility to aggregate BGZF block statistics in a BAM file. Works in the same coordinate space as BAM chunks, so this will be used to calibrate chunk weighting. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5531 348d0f76-0448-11de-a6fe-93d51630548a --- .../reads/utilities/PrintBGZFBounds.java | 137 ++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java new file mode 100644 index 000000000..df7dccaa9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.datasources.reads.utilities;

import net.sf.samtools.SAMFileReader;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.CommandLineProgram;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

/**
 * Prints the file-offset bounds of each BGZF block in a BAM file, followed by
 * aggregate statistics: the average compressed and uncompressed block sizes and
 * the overall compression ratio.
 *
 * The parser reads a fixed-size header, so it assumes every block's gzip extra
 * field holds exactly one subfield (the BGZF "BC" subfield, i.e. XLEN == 6).
 */
public class PrintBGZFBounds extends CommandLineProgram {
    @Argument(fullName="input",shortName="I",doc="Input BAM file to process",required=true)
    private File input = null;

    private static final int BYTE_SIZE_IN_BYTES = Byte.SIZE / 8;
    private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8;
    private static final int SHORT_SIZE_IN_BYTES = INT_SIZE_IN_BYTES / 2;

    /**
     * ID1 + ID2 + CM + FLG + MTIME + XFL + OS + XLEN + SI1 + SI2 + SLEN + BSIZE.
     */
    private static final int HEADER_SIZE = BYTE_SIZE_IN_BYTES*4+INT_SIZE_IN_BYTES+BYTE_SIZE_IN_BYTES*2+SHORT_SIZE_IN_BYTES + BYTE_SIZE_IN_BYTES*2 + SHORT_SIZE_IN_BYTES*2;

    /**
     * CRC32 + ISIZE
     */
    private static final int FOOTER_SIZE = INT_SIZE_IN_BYTES*2;

    /**
     * Walks the input file block by block, printing the inclusive byte range of
     * each BGZF block and, at the end, summary statistics over all blocks.
     *
     * @return 0 on success.
     * @throws IOException if the file cannot be read, ends in the middle of a block,
     *                     or contains a header whose declared size is too small.
     */
    @Override
    public int execute() throws IOException {
        final ByteBuffer headerBuffer = allocateBuffer(HEADER_SIZE);
        final ByteBuffer footerBuffer = allocateBuffer(FOOTER_SIZE);

        // Integral accumulators: a float loses exactness beyond 2^24 bytes.
        long compressedSize = 0;
        long uncompressedSize = 0;
        long totalBlocks = 0;

        final FileInputStream fis = new FileInputStream(input);
        try {
            while(true) {
                final long blockStart = fis.getChannel().position();

                final int headerBytes = fill(fis, headerBuffer);
                if(headerBytes == 0)
                    break; // Clean end of file on a block boundary.
                if(headerBytes < HEADER_SIZE)
                    throw new IOException(String.format("Truncated BGZF header at offset %d: expected %d bytes, found %d",
                                                        blockStart,HEADER_SIZE,headerBytes));

                // BSIZE is the last header field: an unsigned 16-bit value holding (total block size - 1).
                // Mask it so that blocks of 32KiB or more are not read as negative sizes.
                final int bsize = headerBuffer.getShort(HEADER_SIZE-SHORT_SIZE_IN_BYTES) & 0xFFFF;
                final int cDataSize = bsize-HEADER_SIZE-FOOTER_SIZE+1;
                if(cDataSize < 0)
                    throw new IOException(String.format("Invalid BGZF block at offset %d: BSIZE %d is smaller than header plus footer",
                                                        blockStart,bsize));
                compressedSize += cDataSize;

                // Skip past body.
                fis.getChannel().position(fis.getChannel().position()+cDataSize);

                // Read the footer.
                final int footerBytes = fill(fis, footerBuffer);
                if(footerBytes < FOOTER_SIZE)
                    throw new IOException(String.format("Truncated BGZF block starting at offset %d: footer is incomplete",
                                                        blockStart));

                // ISIZE (unsigned 32-bit uncompressed length) is the second footer field.
                uncompressedSize += footerBuffer.getInt(FOOTER_SIZE-INT_SIZE_IN_BYTES) & 0xFFFFFFFFL;

                totalBlocks++;

                final long blockStop = fis.getChannel().position() - 1;

                System.out.printf("BGZF block %d: [%d-%d]%n",totalBlocks,blockStart,blockStop);
            }
        }
        finally {
            fis.close();
        }

        System.out.printf("SUCCESS! Average compressed block size = %f, average uncompressed size = %f, compressed/uncompressed ratio: %f%n",
                          (double)compressedSize/totalBlocks,
                          (double)uncompressedSize/totalBlocks,
                          (double)compressedSize/uncompressedSize);

        return 0;
    }

    /**
     * Clears the buffer and reads from the stream's channel until the buffer is
     * full or end of file is reached.
     *
     * @param in     Stream whose channel supplies the bytes.
     * @param buffer Buffer to overwrite from index 0.
     * @return the number of bytes now in the buffer (0 if already at end of file).
     * @throws IOException on read failure.
     */
    private static int fill(final FileInputStream in, final ByteBuffer buffer) throws IOException {
        buffer.clear();
        while(buffer.hasRemaining()) {
            if(in.getChannel().read(buffer) < 0)
                break;
        }
        return buffer.position();
    }

    /**
     * Creates a little-endian buffer, matching the byte order of BGZF/gzip fields.
     *
     * @param size Capacity of the buffer in bytes.
     * @return the newly allocated buffer.
     */
    private static ByteBuffer allocateBuffer(final int size) {
        ByteBuffer buffer = ByteBuffer.allocate(size);
        buffer.order(ByteOrder.LITTLE_ENDIAN);
        return buffer;
    }

    /**
     * Required main method implementation. Always terminates the JVM: exit code 0
     * when the program completes, 1 otherwise.
     * @param argv Command-line argument text.
     * @throws Exception declared for compatibility; failures are reported via the stack trace and exit code.
     */
    public static void main(String[] argv) throws Exception {
        // Assume failure so that anything escaping start() yields a non-zero exit code.
        int returnCode = 1;
        try {
            PrintBGZFBounds instance = new PrintBGZFBounds();
            start(instance, argv);
            returnCode = 0;
        }
        catch(Exception ex) {
            // No rethrow: System.exit in the finally block would discard it anyway.
            ex.printStackTrace();
        }
        finally {
            System.exit(returnCode);
        }
    }
}