Add an alternate implementation of the BAM file reader that keeps the entire index in memory. Initial revision of BAMFileStat, a tool to inspect BAM file BGZF blocks and index entries.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2769 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-02-03 19:48:15 +00:00
parent c89ba7b1a4
commit 3f35e181d5
5 changed files with 435 additions and 2 deletions

View File

@ -0,0 +1,256 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package net.sf.samtools;
import net.sf.samtools.util.RuntimeIOException;
import java.io.*;
import java.nio.*;
import java.nio.channels.*;
import java.util.*;
/**
* Class for reading BAM file indexes.
*/
public class BAMFileIndex2
{
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
private static final int BAM_LIDX_SHIFT = 14;
/**
* A mapping of reference sequence index to list of bins.
*/
protected final SortedMap<Integer,Bin[]> referenceToBins = new TreeMap<Integer,Bin[]>();
protected final SortedMap<Integer,LinearIndex> referenceToLinearIndices = new TreeMap<Integer,LinearIndex>();
protected BAMFileIndex2(final File file) {
loadIndex(file);
}
/**
* Completely load the index into memory.
* @param file File to load.
*/
private void loadIndex(final File file) {
FileInputStream fileStream;
FileChannel fileChannel;
MappedByteBuffer fileBuffer;
try {
fileStream = new FileInputStream(file);
fileChannel = fileStream.getChannel();
fileBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0L, fileChannel.size());
fileBuffer.order(ByteOrder.LITTLE_ENDIAN);
} catch (IOException exc) {
throw new RuntimeIOException(exc.getMessage(), exc);
}
try {
final byte[] buffer = new byte[4];
readBytes(fileBuffer,buffer);
if (!Arrays.equals(buffer, BAMFileConstants.BAM_INDEX_MAGIC)) {
throw new RuntimeException("Invalid file header in BAM index " + file +
": " + new String(buffer));
}
final int sequenceCount = readInteger(fileBuffer);
for(int sequence = 0; sequence < sequenceCount; sequence++) {
final int binCount = readInteger(fileBuffer);
final Bin[] bins = new Bin[binCount];
for(int bin = 0; bin < binCount; bin++) {
List<Chunk> chunkList = new ArrayList<Chunk>();
final int indexBin = readInteger(fileBuffer);
final int nChunks = readInteger(fileBuffer);
for (int ci = 0; ci < nChunks; ci++) {
final long chunkBegin = readLong(fileBuffer);
final long chunkEnd = readLong(fileBuffer);
chunkList.add(new Chunk(chunkBegin, chunkEnd));
}
bins[bin] = new Bin(sequence,indexBin,chunkList);
}
referenceToBins.put(sequence,bins);
int linearIndexSize = readInteger(fileBuffer);
long[] linearIndex = new long[linearIndexSize];
for(int indexEntry = 0; indexEntry < linearIndexSize; indexEntry++)
linearIndex[indexEntry] = readLong(fileBuffer);
referenceToLinearIndices.put(sequence,new LinearIndex(sequence,linearIndex));
}
}
finally {
try {
fileChannel.close();
fileStream.close();
} catch (IOException exc) {
throw new RuntimeIOException(exc.getMessage(), exc);
}
}
}
/**
* Get list of regions of BAM file that may contain SAMRecords for the given range
* @param referenceIndex sequence of desired SAMRecords
* @param startPos 1-based start of the desired interval, inclusive
* @param endPos 1-based end of the desired interval, inclusive
* @return array of pairs of virtual file positions. Each pair is the first and last
* virtual file position in a range that can be scanned to find SAMRecords that overlap the given
* positions. The last position in each pair is a virtual file pointer to the first SAMRecord beyond
* the range that may contain the indicated SAMRecords.
*/
long[] getSearchBins(final int referenceIndex, final int startPos, final int endPos) {
// System.out.println("# Sequence count: " + sequenceCount);
if (referenceIndex >= referenceToBins.size()) {
return null;
}
final BitSet regionBins = regionToBins(startPos, endPos);
if (regionBins == null) {
return null;
}
Bin[] bins = referenceToBins.get(referenceIndex);
// System.out.println("# Sequence target TID: " + referenceIndex);
if (bins.length == 0) {
return null;
}
List<Chunk> chunkList = new ArrayList<Chunk>();
for(Bin bin: bins) {
if (regionBins.get(bin.binNumber))
chunkList.addAll(bin.chunks);
}
if (chunkList.isEmpty()) {
return null;
}
final int start = (startPos <= 0) ? 0 : startPos-1;
final int regionLinearBin = start >> BAM_LIDX_SHIFT;
// System.out.println("# regionLinearBin: " + regionLinearBin);
LinearIndex index = referenceToLinearIndices.get(referenceIndex);
long minimumOffset = 0;
if (regionLinearBin < index.indexEntries.length)
minimumOffset = index.indexEntries[regionLinearBin];
chunkList = optimizeChunkList(chunkList, minimumOffset);
return convertToArray(chunkList);
}
/**
* Use to get close to the unmapped reads at the end of a BAM file.
* @return The file offset of the first record in the last linear bin, or -1
* if there are no elements in linear bins (i.e. no mapped reads).
*/
long getStartOfLastLinearBin() {
LinearIndex lastLinearIndex = referenceToLinearIndices.get(referenceToLinearIndices.lastKey());
return lastLinearIndex.indexEntries[lastLinearIndex.indexEntries.length-1];
}
private List<Chunk> optimizeChunkList(final List<Chunk> chunkList, final long minimumOffset) {
Chunk lastChunk = null;
Collections.sort(chunkList);
final List<Chunk> result = new ArrayList<Chunk>();
for (final Chunk chunk : chunkList) {
if (chunk.getChunkEnd() <= minimumOffset) {
continue;
}
if (result.isEmpty()) {
result.add(chunk);
lastChunk = chunk;
continue;
}
// Coalesce chunks that are in adjacent file blocks.
// This is a performance optimization.
final long lastFileBlock = getFileBlock(lastChunk.getChunkEnd());
final long chunkFileBlock = getFileBlock(chunk.getChunkStart());
if (chunkFileBlock - lastFileBlock > 1) {
result.add(chunk);
lastChunk = chunk;
} else {
if (chunk.getChunkEnd() > lastChunk.getChunkEnd()) {
lastChunk.setChunkEnd(chunk.getChunkEnd());
}
}
}
return result;
}
private long[] convertToArray(final List<Chunk> chunkList) {
final int count = chunkList.size() * 2;
if (count == 0) {
return null;
}
int index = 0;
final long[] result = new long[count];
for (final Chunk chunk : chunkList) {
result[index++] = chunk.getChunkStart();
result[index++] = chunk.getChunkEnd();
}
return result;
}
/**
* Get candidate bins for the specified region
* @param startPos 1-based start of target region, inclusive.
* @param endPos 1-based end of target region, inclusive.
* @return bit set for each bin that may contain SAMRecords in the target region.
*/
protected BitSet regionToBins(final int startPos, final int endPos) {
final int maxPos = 0x1FFFFFFF;
final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos;
final int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos;
if (start > end) {
return null;
}
int k;
final BitSet bitSet = new BitSet(MAX_BINS);
bitSet.set(0);
for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k);
for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k);
for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k);
for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k);
for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k);
return bitSet;
}
private long getFileBlock(final long bgzfOffset) {
return ((bgzfOffset >> 16L) & 0xFFFFFFFFFFFFL);
}
private void readBytes(MappedByteBuffer source, final byte[] target) {
source.get(target);
}
private int readInteger(MappedByteBuffer source) {
return source.getInt();
}
private long readLong(MappedByteBuffer source) {
return source.getLong();
}
}

View File

@ -0,0 +1,32 @@
package net.sf.samtools;
import java.util.List;
/**
* An individual bin in a BAM file, divided into chunks plus a linear index.
*
* @author mhanna
* @version 0.1
*/
public class Bin {
/**
* The reference sequence associated with this bin.
*/
public final int referenceSequence;
/**
* The number of this bin within the BAM file.
*/
public final int binNumber;
/**
* The chunks contained within this bin.
*/
public final List<Chunk> chunks;
public Bin(int referenceSequence, int binNumber, List<Chunk> chunks) {
this.referenceSequence = referenceSequence;
this.binNumber = binNumber;
this.chunks = chunks;
}
}

View File

@ -22,7 +22,7 @@ public class Chunk implements Comparable<Chunk> {
mChunkEnd = end;
}
long getChunkStart() {
public long getChunkStart() {
return mChunkStart;
}
@ -30,7 +30,7 @@ public class Chunk implements Comparable<Chunk> {
mChunkStart = value;
}
long getChunkEnd() {
public long getChunkEnd() {
return mChunkEnd;
}

View File

@ -0,0 +1,24 @@
package net.sf.samtools;
/**
* The linear index associated with a given reference in a BAM index.
*
* @author mhanna
* @version 0.1
*/
public class LinearIndex {
/**
* The reference sequence number for this linear index.
*/
public final int referenceSequence;
/**
* The linear index entries within this bin.
*/
public final long[] indexEntries;
public LinearIndex(final int referenceSequence, final long[] indexEntries) {
this.referenceSequence = referenceSequence;
this.indexEntries = indexEntries;
}
}

View File

@ -0,0 +1,121 @@
package org.broadinstitute.sting.gatk.datasources;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.StingException;
import java.io.File;
import java.io.IOException;
import java.io.FileInputStream;
import java.io.PrintStream;
import java.nio.channels.FileChannel;
import net.sf.samtools.*;
/**
*
*
* @author mhanna
* @version 0.1
*/
public class BAMFileStat extends CommandLineProgram {
public enum CommandType { ShowBlocks, ShowIndex }
@Argument(doc="Which operation to run.",required=true)
private CommandType command;
@Argument(doc="The BAM file to inspect.",required=true)
private File bamFile;
@Argument(doc="The range of blocks to inspect.",required=false)
private String range;
public int execute() {
Integer startPosition = null, stopPosition = null;
if(range != null) {
int dashPosition = range.indexOf('-');
if(dashPosition > 0) {
if(dashPosition > 0)
startPosition = Integer.valueOf(range.substring(0,dashPosition));
if(dashPosition < range.length()-1)
stopPosition = Integer.valueOf(range.substring(dashPosition+1));
}
else
startPosition = Integer.valueOf(range);
}
switch(command) {
case ShowBlocks:
showBlocks(bamFile,startPosition,stopPosition);
break;
case ShowIndex:
showIndexBins(bamFile);
break;
}
return 0;
}
/**
* Required main method implementation.
* @param argv Command-line arguments.
*/
public static void main(String[] argv) {
try {
BAMFileStat instance = new BAMFileStat();
start(instance, argv);
System.exit(CommandLineProgram.result);
} catch (Exception e) {
exitSystemWithError(e);
}
}
private void showBlocks(File bamFile, Integer startPosition, Integer stopPosition) {
int blockNumber = 0;
try {
BAMBlockIterator iterator = new BAMBlockIterator(bamFile);
while(iterator.hasNext()) {
Block block = iterator.next();
blockNumber++;
if(startPosition != null && startPosition > blockNumber) continue;
if(stopPosition != null && stopPosition < blockNumber) break;
System.out.printf("Block number = %d; position = %d; compressed size = %d; uncompressed size = %d%n", blockNumber, block.position, block.compressedBlockSize, block.uncompressedBlockSize);
}
}
catch(IOException ex) {
throw new StingException("Unable to open BAM file");
}
}
private void showIndexBins(File bamFile) {
BAMFileIndexContentInspector inspector = new BAMFileIndexContentInspector(bamFile);
inspector.inspect(System.out,null,null);
}
private class BAMFileIndexContentInspector extends BAMFileIndex2 {
public BAMFileIndexContentInspector(File bamFileIndex) {
super(bamFileIndex);
}
public void inspect(PrintStream outputStream, Integer startPosition, Integer stopPosition) {
outputStream.printf("Number of reference sequences: %d%n", this.referenceToBins.size());
for(int referenceSequence: referenceToBins.keySet()) {
Bin[] bins = referenceToBins.get(referenceSequence);
outputStream.printf("Reference sequence: %d%n",referenceSequence);
outputStream.printf("number of bins: %d%n",bins.length);
for(Bin bin: bins) {
outputStream.printf("\tBin: %d, number of chunks: %d%n", bin.binNumber, bin.chunks.size());
for(Chunk chunk: bin.chunks)
outputStream.printf("\t\tChunk: %s%n", chunk);
}
LinearIndex linearIndex = referenceToLinearIndices.get(referenceSequence);
outputStream.printf("\t\tIndex entries: %d", linearIndex.indexEntries.length);
for(long indexEntry: linearIndex.indexEntries)
outputStream.printf("%d,",indexEntry);
outputStream.printf("%n");
}
}
}
}