Load the linear index without forcing the index for the entire contig to be

loaded into memory.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5349 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2011-03-02 00:08:39 +00:00
parent a20ce1436d
commit 43567b7fe3
3 changed files with 107 additions and 11 deletions

View File

@ -66,6 +66,16 @@ public class BAMIndexBinIterator {
*/
private final int referenceSequence;
/**
* Position of the linear index in the BAM on disk for the given reference sequence.
*/
private final long linearIndexPosition;
/**
* Size of the linear index for the given reference sequence.
*/
private final int linearIndexSize;
/**
* Size of a long in bytes.
*/
@ -95,28 +105,33 @@ public class BAMIndexBinIterator {
FileChannel metaIndexChannel = metaIndex.getChannel();
// zero out the contents of the file. Arrays of primitives in java are always zeroed out by default.
byte[] emptyContents = new byte[GATKBAMIndex.MAX_BINS*(Long.SIZE/8)]; // byte array is zeroed out by default.
byte[] emptyContents = new byte[GATKBAMIndex.MAX_BINS*LONG_SIZE_IN_BYTES]; // byte array is zeroed out by default.
metaIndexChannel.write(ByteBuffer.wrap(emptyContents));
ByteBuffer binPositionBuffer = ByteBuffer.allocate(emptyContents.length);
binPositionBuffer.order(ByteOrder.LITTLE_ENDIAN);
ByteBuffer positionBuffer = ByteBuffer.allocate(emptyContents.length);
positionBuffer.order(ByteOrder.LITTLE_ENDIAN);
// Write the positions of the bins in the index
for (int binNumber = 0; binNumber < binCount; binNumber++) {
long position = index.position();
final int indexBin = index.readInteger();
metaIndexChannel.position(indexBin*LONG_SIZE_IN_BYTES);
binPositionBuffer.putLong(position);
binPositionBuffer.flip();
positionBuffer.putLong(position);
positionBuffer.flip();
metaIndexChannel.write(binPositionBuffer);
binPositionBuffer.flip();
metaIndexChannel.write(positionBuffer);
positionBuffer.flip();
final int nChunks = index.readInteger();
index.skipBytes(16 * nChunks);
}
// Read the size of the linear index and the first position.
linearIndexSize = index.readInteger();
linearIndexPosition = index.position();
metaIndexChannel.close();
metaIndex.close();
}
@ -133,6 +148,55 @@ public class BAMIndexBinIterator {
return new LevelIterator(level);
}
/**
* Gets the linear index entry for the specified genomic start.
* @param genomicStart Starting location for which to search.
* @return Linear index entry corresponding to this genomic start.
*/
public long getLinearIndexEntry(final int genomicStart) {
final int indexOffset = (genomicStart-1 >> 14);
// If the number of bins is exceeded, don't perform any trimming.
if(indexOffset >= linearIndexSize)
return 0L;
FileInputStream indexInputStream;
try {
indexInputStream = new FileInputStream(indexFile);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to open index file for reading");
}
try {
indexInputStream.getChannel().position(linearIndexPosition+(indexOffset*LONG_SIZE_IN_BYTES));
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to position index file for reading");
}
ByteBuffer indexReader = ByteBuffer.allocate(LONG_SIZE_IN_BYTES);
indexReader.order(ByteOrder.LITTLE_ENDIAN);
try {
indexInputStream.getChannel().read(indexReader);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to position index file for reading");
}
indexReader.flip();
final long linearIndexEntry = indexReader.getLong();
try {
indexInputStream.close();
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to close index file.");
}
return linearIndexEntry;
}
private class LevelIterator implements CloseableIterator<GATKBin> {
/**
* The raw BAM index file with unordered bins.

View File

@ -53,10 +53,16 @@ public class BinTree {
*/
private final int binTreeStop;
public BinTree(final int binTreeStart, final int binTreeStop,final GATKBin[] bins) {
/**
* Linear index entry associated with this location.
*/
private final long linearIndexEntry;
public BinTree(final int binTreeStart, final int binTreeStop,final GATKBin[] bins, final long linearIndexEntry) {
this.binTreeStart = binTreeStart;
this.binTreeStop = binTreeStop;
this.bins = bins;
this.linearIndexEntry = linearIndexEntry;
}
/**
@ -91,6 +97,14 @@ public class BinTree {
return binTreeStop;
}
/**
* The linear index entry associated with this bin tree.
* @return Linear index entry.
*/
public long getLinearIndexEntry() {
return linearIndexEntry;
}
/**
* Returns true if the location of this bin tree is before the given position.
* @param locus Locus to test.
@ -127,6 +141,11 @@ class BinTreeIterator implements Iterator<BinTree> {
*/
private final GATKBAMIndex index;
/**
* Master iterator over the BAM index.
*/
private final BAMIndexBinIterator binIterator;
/**
* Iterators over each individual level.
*/
@ -146,7 +165,7 @@ class BinTreeIterator implements Iterator<BinTree> {
public BinTreeIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) {
this.index = index;
BAMIndexBinIterator binIterator = new BAMIndexBinIterator(index,indexFile,referenceSequence);
binIterator = new BAMIndexBinIterator(index,indexFile,referenceSequence);
levelIterators = new PeekableIterator[GATKBAMIndex.getNumIndexLevels()];
for(int level = 0; level < GATKBAMIndex.getNumIndexLevels(); level++)
levelIterators[level] = new PeekableIterator<GATKBin>(binIterator.getIteratorOverLevel(level));
@ -158,6 +177,11 @@ class BinTreeIterator implements Iterator<BinTree> {
advance();
}
public void close() {
for(PeekableIterator<GATKBin> levelIterator: levelIterators)
levelIterator.close();
}
public boolean hasNext() {
return nextBinTree != null;
}
@ -212,7 +236,10 @@ class BinTreeIterator implements Iterator<BinTree> {
for(int level = 0; level <= lowestLevel; level++) {
if(bins[level] != null) {
Bin lowestLevelBin = new Bin(bins[level].getReferenceSequence(),currentBinInLowestLevel);
nextBinTree = new BinTree(index.getFirstLocusInBin(lowestLevelBin),index.getLastLocusInBin(lowestLevelBin),bins);
final int firstLocusInBin = index.getFirstLocusInBin(lowestLevelBin);
final int lastLocusInBin = index.getLastLocusInBin(lowestLevelBin);
final long linearIndexEntry = binIterator.getLinearIndexEntry(firstLocusInBin);
nextBinTree = new BinTree(firstLocusInBin,lastLocusInBin,bins,linearIndexEntry);
break;
}
}

View File

@ -31,6 +31,7 @@ import net.sf.samtools.GATKChunk;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Arrays;
@ -203,9 +204,13 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
for(GATKChunk chunk: bin.getChunkList())
chunks.add(chunk.clone());
}
final long referenceLinearIndexMinimumOffset = index.getLinearIndex(initialRegion.getContigIndex()).getMinimumOffset(initialRegion.getStart());
final long linearIndexMinimumOffset = binTree.getLinearIndexEntry();
if(linearIndexMinimumOffset != referenceLinearIndexMinimumOffset)
throw new ReviewedStingException(String.format("Loaded linear index offset is incorrect; is %d, should be %d",linearIndexMinimumOffset,referenceLinearIndexMinimumOffset));
// Optimize the chunk list with a linear index optimization
chunks = index.optimizeChunkList(chunks,index.getLinearIndex(initialRegion.getContigIndex()).getMinimumOffset(initialRegion.getStart()));
chunks = index.optimizeChunkList(chunks,linearIndexMinimumOffset);
GATKBAMFileSpan fileSpan = new GATKBAMFileSpan(chunks.toArray(new GATKChunk[chunks.size()]));