Add an alternate implementation of the BAM file reader that keeps the entire index in memory. Initial revision of BAMFileStat, a tool to inspect BAM file BGZF blocks and index entries.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2769 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c89ba7b1a4
commit
3f35e181d5
|
|
@ -0,0 +1,256 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.samtools;
|
||||
|
||||
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.*;
|
||||
import java.nio.channels.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Class for reading BAM file indexes.
|
||||
*/
|
||||
public class BAMFileIndex2
|
||||
{
|
||||
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
|
||||
private static final int BAM_LIDX_SHIFT = 14;
|
||||
|
||||
/**
|
||||
* A mapping of reference sequence index to list of bins.
|
||||
*/
|
||||
protected final SortedMap<Integer,Bin[]> referenceToBins = new TreeMap<Integer,Bin[]>();
|
||||
|
||||
protected final SortedMap<Integer,LinearIndex> referenceToLinearIndices = new TreeMap<Integer,LinearIndex>();
|
||||
|
||||
protected BAMFileIndex2(final File file) {
|
||||
loadIndex(file);
|
||||
}
|
||||
|
||||
/**
|
||||
* Completely load the index into memory.
|
||||
* @param file File to load.
|
||||
*/
|
||||
private void loadIndex(final File file) {
|
||||
FileInputStream fileStream;
|
||||
FileChannel fileChannel;
|
||||
MappedByteBuffer fileBuffer;
|
||||
|
||||
try {
|
||||
fileStream = new FileInputStream(file);
|
||||
fileChannel = fileStream.getChannel();
|
||||
fileBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0L, fileChannel.size());
|
||||
fileBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeIOException(exc.getMessage(), exc);
|
||||
}
|
||||
|
||||
try {
|
||||
final byte[] buffer = new byte[4];
|
||||
readBytes(fileBuffer,buffer);
|
||||
if (!Arrays.equals(buffer, BAMFileConstants.BAM_INDEX_MAGIC)) {
|
||||
throw new RuntimeException("Invalid file header in BAM index " + file +
|
||||
": " + new String(buffer));
|
||||
}
|
||||
|
||||
final int sequenceCount = readInteger(fileBuffer);
|
||||
for(int sequence = 0; sequence < sequenceCount; sequence++) {
|
||||
final int binCount = readInteger(fileBuffer);
|
||||
final Bin[] bins = new Bin[binCount];
|
||||
for(int bin = 0; bin < binCount; bin++) {
|
||||
List<Chunk> chunkList = new ArrayList<Chunk>();
|
||||
final int indexBin = readInteger(fileBuffer);
|
||||
final int nChunks = readInteger(fileBuffer);
|
||||
for (int ci = 0; ci < nChunks; ci++) {
|
||||
final long chunkBegin = readLong(fileBuffer);
|
||||
final long chunkEnd = readLong(fileBuffer);
|
||||
chunkList.add(new Chunk(chunkBegin, chunkEnd));
|
||||
}
|
||||
bins[bin] = new Bin(sequence,indexBin,chunkList);
|
||||
}
|
||||
referenceToBins.put(sequence,bins);
|
||||
|
||||
int linearIndexSize = readInteger(fileBuffer);
|
||||
long[] linearIndex = new long[linearIndexSize];
|
||||
for(int indexEntry = 0; indexEntry < linearIndexSize; indexEntry++)
|
||||
linearIndex[indexEntry] = readLong(fileBuffer);
|
||||
|
||||
referenceToLinearIndices.put(sequence,new LinearIndex(sequence,linearIndex));
|
||||
}
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
fileChannel.close();
|
||||
fileStream.close();
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeIOException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get list of regions of BAM file that may contain SAMRecords for the given range
|
||||
* @param referenceIndex sequence of desired SAMRecords
|
||||
* @param startPos 1-based start of the desired interval, inclusive
|
||||
* @param endPos 1-based end of the desired interval, inclusive
|
||||
* @return array of pairs of virtual file positions. Each pair is the first and last
|
||||
* virtual file position in a range that can be scanned to find SAMRecords that overlap the given
|
||||
* positions. The last position in each pair is a virtual file pointer to the first SAMRecord beyond
|
||||
* the range that may contain the indicated SAMRecords.
|
||||
*/
|
||||
long[] getSearchBins(final int referenceIndex, final int startPos, final int endPos) {
|
||||
|
||||
// System.out.println("# Sequence count: " + sequenceCount);
|
||||
if (referenceIndex >= referenceToBins.size()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final BitSet regionBins = regionToBins(startPos, endPos);
|
||||
if (regionBins == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Bin[] bins = referenceToBins.get(referenceIndex);
|
||||
|
||||
// System.out.println("# Sequence target TID: " + referenceIndex);
|
||||
if (bins.length == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<Chunk> chunkList = new ArrayList<Chunk>();
|
||||
for(Bin bin: bins) {
|
||||
if (regionBins.get(bin.binNumber))
|
||||
chunkList.addAll(bin.chunks);
|
||||
}
|
||||
|
||||
if (chunkList.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final int start = (startPos <= 0) ? 0 : startPos-1;
|
||||
final int regionLinearBin = start >> BAM_LIDX_SHIFT;
|
||||
// System.out.println("# regionLinearBin: " + regionLinearBin);
|
||||
LinearIndex index = referenceToLinearIndices.get(referenceIndex);
|
||||
long minimumOffset = 0;
|
||||
if (regionLinearBin < index.indexEntries.length)
|
||||
minimumOffset = index.indexEntries[regionLinearBin];
|
||||
chunkList = optimizeChunkList(chunkList, minimumOffset);
|
||||
return convertToArray(chunkList);
|
||||
}
|
||||
|
||||
/**
|
||||
* Use to get close to the unmapped reads at the end of a BAM file.
|
||||
* @return The file offset of the first record in the last linear bin, or -1
|
||||
* if there are no elements in linear bins (i.e. no mapped reads).
|
||||
*/
|
||||
long getStartOfLastLinearBin() {
|
||||
LinearIndex lastLinearIndex = referenceToLinearIndices.get(referenceToLinearIndices.lastKey());
|
||||
return lastLinearIndex.indexEntries[lastLinearIndex.indexEntries.length-1];
|
||||
}
|
||||
|
||||
private List<Chunk> optimizeChunkList(final List<Chunk> chunkList, final long minimumOffset) {
|
||||
Chunk lastChunk = null;
|
||||
Collections.sort(chunkList);
|
||||
final List<Chunk> result = new ArrayList<Chunk>();
|
||||
for (final Chunk chunk : chunkList) {
|
||||
if (chunk.getChunkEnd() <= minimumOffset) {
|
||||
continue;
|
||||
}
|
||||
if (result.isEmpty()) {
|
||||
result.add(chunk);
|
||||
lastChunk = chunk;
|
||||
continue;
|
||||
}
|
||||
// Coalesce chunks that are in adjacent file blocks.
|
||||
// This is a performance optimization.
|
||||
final long lastFileBlock = getFileBlock(lastChunk.getChunkEnd());
|
||||
final long chunkFileBlock = getFileBlock(chunk.getChunkStart());
|
||||
if (chunkFileBlock - lastFileBlock > 1) {
|
||||
result.add(chunk);
|
||||
lastChunk = chunk;
|
||||
} else {
|
||||
if (chunk.getChunkEnd() > lastChunk.getChunkEnd()) {
|
||||
lastChunk.setChunkEnd(chunk.getChunkEnd());
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private long[] convertToArray(final List<Chunk> chunkList) {
|
||||
final int count = chunkList.size() * 2;
|
||||
if (count == 0) {
|
||||
return null;
|
||||
}
|
||||
int index = 0;
|
||||
final long[] result = new long[count];
|
||||
for (final Chunk chunk : chunkList) {
|
||||
result[index++] = chunk.getChunkStart();
|
||||
result[index++] = chunk.getChunkEnd();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get candidate bins for the specified region
|
||||
* @param startPos 1-based start of target region, inclusive.
|
||||
* @param endPos 1-based end of target region, inclusive.
|
||||
* @return bit set for each bin that may contain SAMRecords in the target region.
|
||||
*/
|
||||
protected BitSet regionToBins(final int startPos, final int endPos) {
|
||||
final int maxPos = 0x1FFFFFFF;
|
||||
final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos;
|
||||
final int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos;
|
||||
if (start > end) {
|
||||
return null;
|
||||
}
|
||||
int k;
|
||||
final BitSet bitSet = new BitSet(MAX_BINS);
|
||||
bitSet.set(0);
|
||||
for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k);
|
||||
for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k);
|
||||
for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k);
|
||||
for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k);
|
||||
for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k);
|
||||
return bitSet;
|
||||
}
|
||||
|
||||
private long getFileBlock(final long bgzfOffset) {
|
||||
return ((bgzfOffset >> 16L) & 0xFFFFFFFFFFFFL);
|
||||
}
|
||||
|
||||
private void readBytes(MappedByteBuffer source, final byte[] target) {
|
||||
source.get(target);
|
||||
}
|
||||
|
||||
private int readInteger(MappedByteBuffer source) {
|
||||
return source.getInt();
|
||||
}
|
||||
|
||||
private long readLong(MappedByteBuffer source) {
|
||||
return source.getLong();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* An individual bin in a BAM file, divided into chunks plus a linear index.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class Bin {
|
||||
/**
|
||||
* The reference sequence associated with this bin.
|
||||
*/
|
||||
public final int referenceSequence;
|
||||
|
||||
/**
|
||||
* The number of this bin within the BAM file.
|
||||
*/
|
||||
public final int binNumber;
|
||||
|
||||
/**
|
||||
* The chunks contained within this bin.
|
||||
*/
|
||||
public final List<Chunk> chunks;
|
||||
|
||||
public Bin(int referenceSequence, int binNumber, List<Chunk> chunks) {
|
||||
this.referenceSequence = referenceSequence;
|
||||
this.binNumber = binNumber;
|
||||
this.chunks = chunks;
|
||||
}
|
||||
}
|
||||
|
|
@ -22,7 +22,7 @@ public class Chunk implements Comparable<Chunk> {
|
|||
mChunkEnd = end;
|
||||
}
|
||||
|
||||
long getChunkStart() {
|
||||
public long getChunkStart() {
|
||||
return mChunkStart;
|
||||
}
|
||||
|
||||
|
|
@ -30,7 +30,7 @@ public class Chunk implements Comparable<Chunk> {
|
|||
mChunkStart = value;
|
||||
}
|
||||
|
||||
long getChunkEnd() {
|
||||
public long getChunkEnd() {
|
||||
return mChunkEnd;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,24 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
/**
|
||||
* The linear index associated with a given reference in a BAM index.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class LinearIndex {
|
||||
/**
|
||||
* The reference sequence number for this linear index.
|
||||
*/
|
||||
public final int referenceSequence;
|
||||
|
||||
/**
|
||||
* The linear index entries within this bin.
|
||||
*/
|
||||
public final long[] indexEntries;
|
||||
|
||||
public LinearIndex(final int referenceSequence, final long[] indexEntries) {
|
||||
this.referenceSequence = referenceSequence;
|
||||
this.indexEntries = indexEntries;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
package org.broadinstitute.sting.gatk.datasources;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.nio.channels.FileChannel;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BAMFileStat extends CommandLineProgram {
|
||||
public enum CommandType { ShowBlocks, ShowIndex }
|
||||
|
||||
@Argument(doc="Which operation to run.",required=true)
|
||||
private CommandType command;
|
||||
|
||||
@Argument(doc="The BAM file to inspect.",required=true)
|
||||
private File bamFile;
|
||||
|
||||
@Argument(doc="The range of blocks to inspect.",required=false)
|
||||
private String range;
|
||||
|
||||
public int execute() {
|
||||
Integer startPosition = null, stopPosition = null;
|
||||
if(range != null) {
|
||||
int dashPosition = range.indexOf('-');
|
||||
if(dashPosition > 0) {
|
||||
if(dashPosition > 0)
|
||||
startPosition = Integer.valueOf(range.substring(0,dashPosition));
|
||||
if(dashPosition < range.length()-1)
|
||||
stopPosition = Integer.valueOf(range.substring(dashPosition+1));
|
||||
}
|
||||
else
|
||||
startPosition = Integer.valueOf(range);
|
||||
}
|
||||
|
||||
switch(command) {
|
||||
case ShowBlocks:
|
||||
showBlocks(bamFile,startPosition,stopPosition);
|
||||
break;
|
||||
case ShowIndex:
|
||||
showIndexBins(bamFile);
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Required main method implementation.
|
||||
* @param argv Command-line arguments.
|
||||
*/
|
||||
public static void main(String[] argv) {
|
||||
try {
|
||||
BAMFileStat instance = new BAMFileStat();
|
||||
start(instance, argv);
|
||||
System.exit(CommandLineProgram.result);
|
||||
} catch (Exception e) {
|
||||
exitSystemWithError(e);
|
||||
}
|
||||
}
|
||||
|
||||
private void showBlocks(File bamFile, Integer startPosition, Integer stopPosition) {
|
||||
int blockNumber = 0;
|
||||
|
||||
try {
|
||||
BAMBlockIterator iterator = new BAMBlockIterator(bamFile);
|
||||
while(iterator.hasNext()) {
|
||||
Block block = iterator.next();
|
||||
blockNumber++;
|
||||
|
||||
if(startPosition != null && startPosition > blockNumber) continue;
|
||||
if(stopPosition != null && stopPosition < blockNumber) break;
|
||||
|
||||
System.out.printf("Block number = %d; position = %d; compressed size = %d; uncompressed size = %d%n", blockNumber, block.position, block.compressedBlockSize, block.uncompressedBlockSize);
|
||||
}
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Unable to open BAM file");
|
||||
}
|
||||
}
|
||||
|
||||
private void showIndexBins(File bamFile) {
|
||||
BAMFileIndexContentInspector inspector = new BAMFileIndexContentInspector(bamFile);
|
||||
inspector.inspect(System.out,null,null);
|
||||
}
|
||||
|
||||
private class BAMFileIndexContentInspector extends BAMFileIndex2 {
|
||||
public BAMFileIndexContentInspector(File bamFileIndex) {
|
||||
super(bamFileIndex);
|
||||
}
|
||||
|
||||
public void inspect(PrintStream outputStream, Integer startPosition, Integer stopPosition) {
|
||||
outputStream.printf("Number of reference sequences: %d%n", this.referenceToBins.size());
|
||||
for(int referenceSequence: referenceToBins.keySet()) {
|
||||
Bin[] bins = referenceToBins.get(referenceSequence);
|
||||
outputStream.printf("Reference sequence: %d%n",referenceSequence);
|
||||
outputStream.printf("number of bins: %d%n",bins.length);
|
||||
for(Bin bin: bins) {
|
||||
outputStream.printf("\tBin: %d, number of chunks: %d%n", bin.binNumber, bin.chunks.size());
|
||||
for(Chunk chunk: bin.chunks)
|
||||
outputStream.printf("\t\tChunk: %s%n", chunk);
|
||||
}
|
||||
LinearIndex linearIndex = referenceToLinearIndices.get(referenceSequence);
|
||||
outputStream.printf("\t\tIndex entries: %d", linearIndex.indexEntries.length);
|
||||
for(long indexEntry: linearIndex.indexEntries)
|
||||
outputStream.printf("%d,",indexEntry);
|
||||
outputStream.printf("%n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue