Rewrite of GATKBAMIndex to avoid mmaps causing false reports of heavy memory

usage.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5620 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2011-04-12 23:49:58 +00:00
parent 36d8f55286
commit 22a11e41e1
8 changed files with 127 additions and 713 deletions

View File

@ -1,328 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.GATKBin;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
/**
* Creates an 'index of the index' for a particular reference sequence
* within the BAM file, for easier whole-BAM-file traversal.
* file.
*/
public class BAMIndexBinIterator {
/**
* The source of index data.
*/
private final GATKBAMIndex index;
/**
* The file storing the index data.
*/
private final File indexFile;
/**
* File storing index metadata.
*/
private final File metaIndexFile;
/**
* Reference sequence that temporary file is based on.
*/
private final int referenceSequence;
/**
* Position of the linear index in the BAM on disk for the given reference sequence.
*/
private final long linearIndexPosition;
/**
* Size of the linear index for the given reference sequence.
*/
private final int linearIndexSize;
/**
* Size of a long in bytes.
*/
private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;
public BAMIndexBinIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) {
this.index = index;
this.indexFile = indexFile;
this.referenceSequence = referenceSequence;
index.seek(4);
final int sequenceCount = index.readInteger();
if (referenceSequence >= sequenceCount)
throw new ReviewedStingException(String.format("Reference sequence past end of genome; reference sequence = %d, sequence count = %d",referenceSequence,sequenceCount));
index.skipToSequence(referenceSequence);
int binCount = index.readInteger();
try {
metaIndexFile = File.createTempFile("bammetaindex."+referenceSequence,null);
metaIndexFile.deleteOnExit();
FileOutputStream metaIndex = new FileOutputStream(metaIndexFile);
FileChannel metaIndexChannel = metaIndex.getChannel();
// zero out the contents of the file. Arrays of primitives in java are always zeroed out by default.
byte[] emptyContents = new byte[GATKBAMIndex.MAX_BINS*LONG_SIZE_IN_BYTES]; // byte array is zeroed out by default.
metaIndexChannel.write(ByteBuffer.wrap(emptyContents));
ByteBuffer positionBuffer = ByteBuffer.allocate(emptyContents.length);
positionBuffer.order(ByteOrder.LITTLE_ENDIAN);
// Write the positions of the bins in the index
for (int binNumber = 0; binNumber < binCount; binNumber++) {
long position = index.position();
final int indexBin = index.readInteger();
metaIndexChannel.position(indexBin*LONG_SIZE_IN_BYTES);
positionBuffer.putLong(position);
positionBuffer.flip();
metaIndexChannel.write(positionBuffer);
positionBuffer.flip();
final int nChunks = index.readInteger();
index.skipBytes(16 * nChunks);
}
// Read the size of the linear index and the first position.
linearIndexSize = index.readInteger();
linearIndexPosition = index.position();
metaIndexChannel.close();
metaIndex.close();
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to write BAM metaindex",ex);
}
}
public void close() {
metaIndexFile.delete();
}
public CloseableIterator<GATKBin> getIteratorOverLevel(int level) {
return new LevelIterator(level);
}
/**
* Gets the linear index entry for the specified genomic start.
* @param genomicStart Starting location for which to search.
* @return Linear index entry corresponding to this genomic start.
*/
public long getLinearIndexEntry(final int genomicStart) {
final int indexOffset = (genomicStart-1 >> 14);
// If the number of bins is exceeded, don't perform any trimming.
if(indexOffset >= linearIndexSize)
return 0L;
FileInputStream indexInputStream;
try {
indexInputStream = new FileInputStream(indexFile);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to open index file for reading");
}
try {
indexInputStream.getChannel().position(linearIndexPosition+(indexOffset*LONG_SIZE_IN_BYTES));
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to position index file for reading");
}
ByteBuffer indexReader = ByteBuffer.allocate(LONG_SIZE_IN_BYTES);
indexReader.order(ByteOrder.LITTLE_ENDIAN);
try {
indexInputStream.getChannel().read(indexReader);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to position index file for reading");
}
indexReader.flip();
final long linearIndexEntry = indexReader.getLong();
try {
indexInputStream.close();
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to close index file.");
}
return linearIndexEntry;
}
private class LevelIterator implements CloseableIterator<GATKBin> {
/**
* The raw BAM index file with unordered bins.
*/
private final FileInputStream indexInputStream;
/**
* The index metafile, with pointers to ordered bins.
*/
private final FileInputStream metaIndexInputStream;
/**
* The first and last bins in the level.
*/
private final int firstBinNumber, lastBinNumber;
/**
* The current bin in the index.
*/
private int currentBinNumber;
/**
* Position of the most recent chunk data.
*/
private GATKBin nextBin = null;
public LevelIterator(final int level) {
try {
indexInputStream = new FileInputStream(indexFile);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to open index file for reading");
}
try {
metaIndexInputStream = new FileInputStream(metaIndexFile);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to open index metafile for reading");
}
firstBinNumber = GATKBAMIndex.getFirstBinInLevel(level);
lastBinNumber = firstBinNumber + index.getLevelSize(level) - 1;
currentBinNumber = firstBinNumber - 1;
advance();
}
public void close() {
try {
indexInputStream.close();
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to close index file.");
}
try {
metaIndexInputStream.close();
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to close index metafile");
}
}
public boolean hasNext() {
return nextBin != null;
}
public GATKBin next() {
if(!hasNext())
throw new NoSuchElementException("Out of elements in BAMIndexBinIterator");
GATKBin currentPosition = nextBin;
advance();
return currentPosition;
}
public void remove() { throw new UnsupportedOperationException("Cannot remove from a LevelIterator"); }
private void advance() {
ByteBuffer indexFilePositionDecoder = ByteBuffer.allocate(LONG_SIZE_IN_BYTES*2);
indexFilePositionDecoder.order(ByteOrder.LITTLE_ENDIAN);
nextBin = null;
try {
indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES);
while(nextBin == null && currentBinNumber < lastBinNumber) {
currentBinNumber++;
metaIndexInputStream.getChannel().position(currentBinNumber*LONG_SIZE_IN_BYTES);
metaIndexInputStream.getChannel().read(indexFilePositionDecoder);
indexFilePositionDecoder.flip();
long currentPosition = indexFilePositionDecoder.getLong();
indexFilePositionDecoder.flip();
if(currentPosition != 0) {
indexInputStream.getChannel().position(currentPosition);
indexInputStream.getChannel().read(indexFilePositionDecoder);
indexFilePositionDecoder.flip();
int binNumber = indexFilePositionDecoder.getInt();
if(binNumber != currentBinNumber)
throw new ReviewedStingException("Index file and metaindex file are out of sync.");
int nChunks = indexFilePositionDecoder.getInt();
GATKChunk[] chunks = new GATKChunk[nChunks];
indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES*2);
indexFilePositionDecoder.clear();
for (int ci = 0; ci < nChunks; ci++) {
indexInputStream.getChannel().read(indexFilePositionDecoder);
indexFilePositionDecoder.flip();
final long chunkBegin = indexFilePositionDecoder.getLong();
final long chunkEnd = indexFilePositionDecoder.getLong();
chunks[ci] = new GATKChunk(chunkBegin, chunkEnd);
indexFilePositionDecoder.flip();
}
nextBin = new GATKBin(referenceSequence,binNumber);
nextBin.setChunkList(chunks);
}
}
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to close index metafile");
}
}
}
}

View File

@ -88,10 +88,10 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
/**
* Create a new BAM schedule based on the given index.
* @param indices Mapping from readers to indices.
* @param indexFiles Index files.
* @param intervals List of
*/
public BAMSchedule(final Map<SAMReaderID,GATKBAMIndex> indices, final List<GenomeLoc> intervals) {
public BAMSchedule(final Map<SAMReaderID,File> indexFiles, final List<GenomeLoc> intervals) {
if(intervals.isEmpty())
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
@ -99,11 +99,10 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
createScheduleFile();
readerIDs.addAll(indices.keySet());
readerIDs.addAll(indexFiles.keySet());
for(final SAMReaderID reader: readerIDs) {
GATKBAMIndex index = indices.get(reader);
BAMIndexContent indexContent = index.getQueryResults(referenceSequence);
GATKBAMIndex index = new GATKBAMIndex(indexFiles.get(reader),referenceSequence);
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
Iterator<GenomeLoc> locusIterator = intervals.iterator();
@ -133,7 +132,7 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
// Code at this point knows that the current bin is neither before nor after the current locus,
// so it must overlap. Add this region to the filesystem.
final GATKBAMFileSpan fileSpan = index.getSpanOverlapping(indexContent,bin);
final GATKBAMFileSpan fileSpan = index.getSpanOverlapping(bin);
if(!fileSpan.isEmpty()) {
// File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves.

View File

@ -30,6 +30,7 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
@ -46,7 +47,7 @@ import java.util.NoSuchElementException;
public class BAMScheduler implements Iterator<FilePointer> {
private final SAMDataSource dataSource;
private final Map<SAMReaderID,GATKBAMIndex> indices = new HashMap<SAMReaderID,GATKBAMIndex>();
private final Map<SAMReaderID, File> indexFiles = new HashMap<SAMReaderID,File>();
private FilePointer nextFilePointer = null;
@ -59,7 +60,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
this.dataSource = dataSource;
for(SAMReaderID reader: dataSource.getReaderIDs())
indices.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
indexFiles.put(reader,(File)dataSource.getIndex(reader));
this.loci = loci;
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
if(locusIterator.hasNext())
@ -104,7 +105,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
int coveredRegionStop = Integer.MAX_VALUE;
GenomeLoc coveredRegion = null;
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indices,currentLocus);
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus);
// No overlapping data at all.
if(scheduleEntry != null) {
@ -117,7 +118,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
else {
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
for(SAMReaderID reader: indices.keySet())
for(SAMReaderID reader: indexFiles.keySet())
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
}
@ -183,11 +184,11 @@ public class BAMScheduler implements Iterator<FilePointer> {
/**
* Get the next overlapping tree of bins associated with the given BAM file.
* @param indices BAM index representation.
* @param indexFiles BAM index file.
* @param currentLocus The actual locus for which to check overlap.
* @return The next schedule entry overlapping with the given list of loci.
*/
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,File> indexFiles, final GenomeLoc currentLocus) {
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
if(bamScheduleIterator != null)
@ -201,7 +202,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
lociInContig.add(locus);
}
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indexFiles,lociInContig));
}
if(!bamScheduleIterator.hasNext())

View File

@ -23,24 +23,20 @@
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.BAMIndex;
import net.sf.samtools.BAMIndexMetaData;
import net.sf.samtools.Bin;
import net.sf.samtools.BrowseableBAMIndex;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKBin;
import net.sf.samtools.GATKBinList;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.LinearIndex;
import net.sf.samtools.SAMException;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.util.RuntimeIOException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.*;
@ -50,7 +46,7 @@ import java.util.*;
* @author mhanna
* @version 0.1
*/
public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
public class GATKBAMIndex {
/**
* Reports the total amount of genomic data that any bin can index.
*/
@ -66,46 +62,75 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
*/
public static final int MAX_BINS = 37450; // =(8^6-1)/7+1
public static final int MAX_LINEAR_INDEX_SIZE = MAX_BINS+1-LEVEL_STARTS[LEVEL_STARTS.length-1];
private final File mFile;
private final MappedByteBuffer mFileBuffer;
private SAMSequenceDictionary mBamDictionary = null;
private final List<GATKBin> bins = new ArrayList<GATKBin>();
private final LinearIndex linearIndex;
public GATKBAMIndex(final File file, final SAMSequenceDictionary dictionary) {
public GATKBAMIndex(final File file, final int referenceSequence) {
mFile = file;
mBamDictionary = dictionary;
// Open the file stream.
try {
FileInputStream fileStream = new FileInputStream(mFile);
FileChannel fileChannel = fileStream.getChannel();
mFileBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0L, fileChannel.size());
mFileBuffer.order(ByteOrder.LITTLE_ENDIAN);
fileChannel.close();
fileStream.close();
FileInputStream fileStream = null;
FileChannel fileChannel = null;
try {
fileStream = new FileInputStream(mFile);
fileChannel = fileStream.getChannel();
}
catch (IOException exc) {
throw new RuntimeIOException(exc.getMessage(), exc);
}
// Verify the magic number.
seek(0);
seek(fileChannel,0);
final byte[] buffer = new byte[4];
readBytes(buffer);
readBytes(fileChannel,buffer);
if (!Arrays.equals(buffer, GATKBAMFileConstants.BAM_INDEX_MAGIC)) {
throw new RuntimeException("Invalid file header in BAM index " + mFile +
": " + new String(buffer));
}
}
/**
* Gets the file backing this index.
* @return The index file.
*/
public File getIndexFile() {
return mFile;
seek(fileChannel,4);
final int sequenceCount = readInteger(fileChannel);
if (referenceSequence >= sequenceCount)
throw new ReviewedStingException("Invalid sequence number " + referenceSequence);
skipToSequence(fileChannel,referenceSequence);
int binCount = readInteger(fileChannel);
for (int binNumber = 0; binNumber < binCount; binNumber++) {
final int indexBin = readInteger(fileChannel);
final int nChunks = readInteger(fileChannel);
List<GATKChunk> chunks = new ArrayList<GATKChunk>(nChunks);
for (int ci = 0; ci < nChunks; ci++) {
final long chunkBegin = readLong(fileChannel);
final long chunkEnd = readLong(fileChannel);
chunks.add(new GATKChunk(chunkBegin, chunkEnd));
}
GATKBin bin = new GATKBin(referenceSequence, indexBin);
bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()]));
while(indexBin >= bins.size())
bins.add(null);
bins.set(indexBin,bin);
}
final int nLinearBins = readInteger(fileChannel);
long[] linearIndexEntries = new long[nLinearBins];
for(int linearIndexOffset = 0; linearIndexOffset < nLinearBins; linearIndexOffset++)
linearIndexEntries[linearIndexOffset] = readLong(fileChannel);
linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries);
try {
fileChannel.close();
fileStream.close();
}
catch (IOException exc) {
throw new RuntimeIOException(exc.getMessage(), exc);
}
}
/**
@ -142,7 +167,6 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
* @param bin The bin for which to determine the level.
* @return the level associated with the given bin number.
*/
@Override
public int getLevelForBin(final Bin bin) {
GATKBin gatkBin = new GATKBin(bin);
if(gatkBin.getBinNumber() >= MAX_BINS)
@ -171,7 +195,6 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
@Override
public int getLastLocusInBin(final Bin bin) {
final int level = getLevelForBin(bin);
final int levelStart = LEVEL_STARTS[level];
@ -179,164 +202,32 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
}
public int getNumberOfReferences() {
seek(4);
return readInteger();
}
/**
* Use to get close to the unmapped reads at the end of a BAM file.
* @return The file offset of the first record in the last linear bin, or -1
* if there are no elements in linear bins (i.e. no mapped reads).
*/
public long getStartOfLastLinearBin() {
seek(4);
final int sequenceCount = readInteger();
// Because no reads may align to the last sequence in the sequence dictionary,
// grab the last element of the linear index for each sequence, and return
// the last one from the last sequence that has one.
long lastLinearIndexPointer = -1;
for (int i = 0; i < sequenceCount; i++) {
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
// System.out.println("# nBins: " + nBins);
for (int j1 = 0; j1 < nBins; j1++) {
// Skip bin #
skipBytes(4);
final int nChunks = readInteger();
// Skip chunks
skipBytes(16 * nChunks);
}
final int nLinearBins = readInteger();
if (nLinearBins > 0) {
// Skip to last element of list of linear bins
skipBytes(8 * (nLinearBins - 1));
lastLinearIndexPointer = readLong();
}
}
return lastLinearIndexPointer;
}
/**
* Gets meta data for the given reference including information about number of aligned, unaligned, and noCoordinate records
* @param reference the reference of interest
* @return meta data for the reference
*/
public BAMIndexMetaData getMetaData(int reference) {
throw new UnsupportedOperationException("Cannot retrieve metadata for GATKBAMIndex");
}
/**
* Returns count of records unassociated with any reference. Call before the index file is closed
*
* @return meta data at the end of the bam index that indicates count of records holding no coordinates
* or null if no meta data (old index format)
*/
public Long getNoCoordinateCount() {
seek(4);
final int sequenceCount = readInteger();
skipToSequence(sequenceCount);
try { // in case of old index file without meta data
return readLong();
} catch (Exception e) {
return null;
}
}
/**
* Get list of regions of BAM file that may contain SAMRecords for the given range
* @param referenceIndex sequence of desired SAMRecords
* @param startPos 1-based start of the desired interval, inclusive
* @param endPos 1-based end of the desired interval, inclusive
* @return the virtual file position. Each pair is the first and last virtual file position
* in a range that can be scanned to find SAMRecords that overlap the given positions.
*/
@Override
public GATKBAMFileSpan getSpanOverlapping(final int referenceIndex, final int startPos, final int endPos) {
BAMIndexContent queryResults = getQueryResults(referenceIndex);
if(queryResults == null)
return null;
GATKBinList overlappingBins = getBinsOverlapping(referenceIndex,startPos,endPos);
// System.out.println("# Sequence target TID: " + referenceIndex);
List<GATKBin> bins = new ArrayList<GATKBin>();
for(GATKBin bin: queryResults.getBins()) {
if (overlappingBins.getBins().get(bin.getBinNumber()))
bins.add(bin);
}
if (bins.isEmpty()) {
return null;
}
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
for(GATKBin bin: bins) {
for(GATKChunk chunk: bin.getChunkList())
chunkList.add(chunk.clone());
}
if (chunkList.isEmpty()) {
return null;
}
chunkList = optimizeChunkList(chunkList,queryResults.getLinearIndex().getMinimumOffset(startPos));
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
}
/**
* Perform an overlapping query of all bins bounding the given location.
* @param bin The bin over which to perform an overlapping query.
* @return The file pointers
*/
@Override
public GATKBAMFileSpan getSpanOverlapping(final Bin bin) {
if(bin == null)
return null;
GATKBin gatkBin = new GATKBin(bin);
final int referenceSequence = gatkBin.getReferenceSequence();
BAMIndexContent indexQuery = getQueryResults(referenceSequence);
return getSpanOverlapping(indexQuery,bin);
}
/**
* Perform an overlapping query of all bins bounding the given location.
* @param bin The bin over which to perform an overlapping query.
* @return The file pointers
*/
public GATKBAMFileSpan getSpanOverlapping(final BAMIndexContent indexContent, final Bin bin) {
if(bin == null)
return null;
GATKBin gatkBin = new GATKBin(bin);
if(indexContent == null)
return null;
final int binLevel = getLevelForBin(bin);
final int firstLocusInBin = getFirstLocusInBin(bin);
// Add the specified bin to the tree if it exists.
List<GATKBin> binTree = new ArrayList<GATKBin>();
if(indexContent.containsBin(gatkBin))
binTree.add(indexContent.getBins().getBin(gatkBin.getBinNumber()));
if(gatkBin.getBinNumber() < bins.size() && bins.get(gatkBin.getBinNumber()) != null)
binTree.add(bins.get(gatkBin.getBinNumber()));
int currentBinLevel = binLevel;
while(--currentBinLevel >= 0) {
final int binStart = getFirstBinInLevel(currentBinLevel);
final int binWidth = getMaxAddressibleGenomicLocation()/getLevelSize(currentBinLevel);
final int binNumber = firstLocusInBin/binWidth + binStart;
GATKBin parentBin = indexContent.getBins().getBin(binNumber);
if(parentBin != null && indexContent.containsBin(parentBin))
binTree.add(parentBin);
if(binNumber < bins.size() && bins.get(binNumber) != null)
binTree.add(bins.get(binNumber));
}
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
@ -346,173 +237,10 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
}
final int start = getFirstLocusInBin(bin);
chunkList = optimizeChunkList(chunkList,indexContent.getLinearIndex().getMinimumOffset(start));
chunkList = optimizeChunkList(chunkList,linearIndex.getMinimumOffset(start));
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
}
public GATKBAMFileSpan getContentsOfBin(final Bin bin) {
if(bin == null)
return null;
GATKBin gatkBin = new GATKBin(bin);
BAMIndexContent indexQuery = getQueryResults(gatkBin.getReferenceSequence());
if(indexQuery == null)
return null;
GATKBin queriedBin = indexQuery.getBins().getBin(gatkBin.getBinNumber());
return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getChunkList()) : null;
}
/**
* Retrieves the linear index for the given reference sequence.
* @param referenceSequence Reference sequence number for which to retrieve the reference.
* @return The linear index for the given reference sequence.
*/
public LinearIndex getLinearIndex(int referenceSequence) {
return getQueryResults(referenceSequence).getLinearIndex();
}
/**
* Get a list of bins in the BAM file that may contain SAMRecords for the given range.
* @param referenceIndex sequence of desired SAMRecords
* @param startPos 1-based start of the desired interval, inclusive
* @param endPos 1-based end of the desired interval, inclusive
* @return a list of bins that contain relevant data.
*/
public GATKBinList getBinsOverlapping(final int referenceIndex, final int startPos, final int endPos) {
final BitSet regionBins = regionToBins(startPos,endPos);
if (regionBins == null) {
return null;
}
return new GATKBinList(referenceIndex,regionBins);
}
protected BAMIndexContent query(final int referenceSequence, final int startPos, final int endPos) {
seek(4);
List<GATKChunk> metaDataChunks = new ArrayList<GATKChunk>();
final int sequenceCount = readInteger();
if (referenceSequence >= sequenceCount) {
return null;
}
final BitSet regionBins = regionToBins(startPos, endPos);
if (regionBins == null) {
return null;
}
skipToSequence(referenceSequence);
int binCount = readInteger();
boolean metaDataSeen = false;
GATKBin[] bins = new GATKBin[getMaxBinNumberForReference(referenceSequence) +1];
for (int binNumber = 0; binNumber < binCount; binNumber++) {
final int indexBin = readInteger();
final int nChunks = readInteger();
List<GATKChunk> chunks = new ArrayList<GATKChunk>(nChunks);
// System.out.println("# bin[" + i + "] = " + indexBin + ", nChunks = " + nChunks);
GATKChunk lastChunk = null;
if (regionBins.get(indexBin)) {
for (int ci = 0; ci < nChunks; ci++) {
final long chunkBegin = readLong();
final long chunkEnd = readLong();
lastChunk = new GATKChunk(chunkBegin, chunkEnd);
chunks.add(lastChunk);
}
} else if (indexBin == MAX_BINS) {
// meta data - build the bin so that the count of bins is correct;
// but don't attach meta chunks to the bin, or normal queries will be off
for (int ci = 0; ci < nChunks; ci++) {
final long chunkBegin = readLong();
final long chunkEnd = readLong();
lastChunk = new GATKChunk(chunkBegin, chunkEnd);
metaDataChunks.add(lastChunk);
}
metaDataSeen = true;
continue; // don't create a Bin
} else {
skipBytes(16 * nChunks);
}
GATKBin bin = new GATKBin(referenceSequence, indexBin);
bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()]));
bins[indexBin] = bin;
}
final int nLinearBins = readInteger();
final int regionLinearBinStart = LinearIndex.convertToLinearIndexOffset(startPos);
final int regionLinearBinStop = endPos > 0 ? LinearIndex.convertToLinearIndexOffset(endPos) : nLinearBins-1;
final int actualStop = Math.min(regionLinearBinStop, nLinearBins -1);
long[] linearIndexEntries = new long[0];
if (regionLinearBinStart < nLinearBins) {
linearIndexEntries = new long[actualStop-regionLinearBinStart+1];
skipBytes(8 * regionLinearBinStart);
for(int linearBin = regionLinearBinStart; linearBin <= actualStop; linearBin++)
linearIndexEntries[linearBin-regionLinearBinStart] = readLong();
}
final LinearIndex linearIndex = new LinearIndex(referenceSequence,regionLinearBinStart,linearIndexEntries);
return new BAMIndexContent(referenceSequence, bins, binCount - (metaDataSeen? 1 : 0), linearIndex);
}
/**
* The maxiumum bin number for a reference sequence of a given length
*/
static int getMaxBinNumberForSequenceLength(int sequenceLength) {
return getFirstBinInLevel(getNumIndexLevels() - 1) + (sequenceLength >> 14);
// return 4680 + (sequenceLength >> 14); // note 4680 = getFirstBinInLevel(getNumIndexLevels() - 1)
}
/**
* Looks up the cached BAM query results if they're still in the cache and not expired. Otherwise,
* retrieves the cache results from disk.
* @param referenceIndex The reference to load. CachingBAMFileIndex only stores index data for entire references.
* @return The index information for this reference.
*/
public BAMIndexContent getQueryResults(final int referenceIndex) {
// If not in the cache, attempt to load it from disk.
return query(referenceIndex,1,-1);
}
/**
* Gets the possible number of bins for a given reference sequence.
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.
*/
protected int getMaxAddressibleGenomicLocation() {
return BIN_GENOMIC_SPAN;
}
/**
* Get candidate bins for the specified region
* @param startPos 1-based start of target region, inclusive.
* @param endPos 1-based end of target region, inclusive.
* @return bit set for each bin that may contain SAMRecords in the target region.
*/
protected BitSet regionToBins(final int startPos, final int endPos) {
final int maxPos = 0x1FFFFFFF;
final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos;
final int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos;
if (start > end) {
return null;
}
int k;
final BitSet bitSet = new BitSet(MAX_BINS);
bitSet.set(0);
for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k);
for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k);
for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k);
for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k);
for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k);
return bitSet;
}
protected List<GATKChunk> optimizeChunkList(final List<GATKChunk> chunks, final long minimumOffset) {
GATKChunk lastChunk = null;
Collections.sort(chunks);
@ -541,61 +269,76 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
}
/**
* The maximum possible bin number for this reference sequence.
* This is based on the maximum coordinate position of the reference
* which is based on the size of the reference
* Gets the possible number of bins for a given reference sequence.
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.
*/
private int getMaxBinNumberForReference(final int reference) {
try {
final int sequenceLength = mBamDictionary.getSequence(reference).getSequenceLength();
return getMaxBinNumberForSequenceLength(sequenceLength);
} catch (Exception e) {
return MAX_BINS;
}
}
protected int getMaxAddressibleGenomicLocation() {
return BIN_GENOMIC_SPAN;
}
protected void skipToSequence(final int sequenceIndex) {
protected void skipToSequence(final FileChannel fileChannel, final int sequenceIndex) {
for (int i = 0; i < sequenceIndex; i++) {
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
final int nBins = readInteger(fileChannel);
// System.out.println("# nBins: " + nBins);
for (int j = 0; j < nBins; j++) {
final int bin = readInteger();
final int nChunks = readInteger();
final int bin = readInteger(fileChannel);
final int nChunks = readInteger(fileChannel);
// System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks);
skipBytes(16 * nChunks);
skipBytes(fileChannel, 16 * nChunks);
}
final int nLinearBins = readInteger();
final int nLinearBins = readInteger(fileChannel);
// System.out.println("# nLinearBins: " + nLinearBins);
skipBytes(8 * nLinearBins);
skipBytes(fileChannel, 8 * nLinearBins);
}
}
private void readBytes(final byte[] bytes) {
mFileBuffer.get(bytes);
private void readBytes(final FileChannel fileChannel, final byte[] bytes) {
ByteBuffer buffer = ByteBuffer.wrap(bytes);
try {
fileChannel.read(buffer);
}
catch(IOException ex) {
throw new ReviewedStingException("Index: unable to read bytes from index file.");
}
}
protected int readInteger() {
return mFileBuffer.getInt();
private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8;
private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;
private ByteBuffer wrapBuffer(final byte[] bytes) {
ByteBuffer buffer = ByteBuffer.wrap(bytes);
buffer.order(ByteOrder.LITTLE_ENDIAN);
return buffer;
}
protected long readLong() {
return mFileBuffer.getLong();
private int readInteger(final FileChannel fileChannel) {
byte[] bytes = new byte[INT_SIZE_IN_BYTES];
readBytes(fileChannel,bytes);
return wrapBuffer(bytes).getInt();
}
protected void skipBytes(final int count) {
mFileBuffer.position(mFileBuffer.position() + count);
private long readLong(final FileChannel fileChannel) {
byte[] bytes = new byte[LONG_SIZE_IN_BYTES];
readBytes(fileChannel,bytes);
return wrapBuffer(bytes).getLong();
}
protected void seek(final int position) {
mFileBuffer.position(position);
private void skipBytes(final FileChannel fileChannel, final int count) {
try {
fileChannel.position(fileChannel.position() + count);
}
catch(IOException ex) {
throw new ReviewedStingException("Index: unable to reposition file channel.");
}
}
protected long position() {
return mFileBuffer.position();
}
@Override
public void close() {
private void seek(final FileChannel fileChannel, final int position) {
try {
fileChannel.position(position);
}
catch(IOException ex) {
throw new ReviewedStingException("Index: unable to reposition of file channel.");
}
}
}

View File

@ -140,7 +140,7 @@ public class IntervalSharder {
// If this contig can't be found in the reference, skip over it.
if(referenceSequence == null && contig != null)
continue;
final BrowseableBAMIndex index = dataSource.getIndex(id);
final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id);
binMerger.addReader(id,
index,
referenceSequence.getSequenceIndex(),

View File

@ -45,6 +45,7 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
private final GenomeLocParser parser;
public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
System.out.println("Low memory interval sharding enabled");
wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
parser = loci.getGenomeLocParser();
}

View File

@ -84,7 +84,7 @@ public class SAMDataSource {
/**
* Store BAM indices for each reader present.
*/
private final Map<SAMReaderID,GATKBAMIndex> bamIndices = new HashMap<SAMReaderID,GATKBAMIndex>();
private final Map<SAMReaderID,Object> bamIndices = new HashMap<SAMReaderID,Object>();
/**
* How far along is each reader?
@ -292,9 +292,7 @@ public class SAMDataSource {
for(SAMReaderID id: readerIDs) {
File indexFile = findIndexFile(id.samFile);
if(indexFile != null) {
SAMSequenceDictionary sequenceDictionary = readers.getReader(id).getFileHeader().getSequenceDictionary();
GATKBAMIndex index = new GATKBAMIndex(indexFile,sequenceDictionary);
bamIndices.put(id,index);
bamIndices.put(id,indexFile);
}
}
}
@ -428,7 +426,7 @@ public class SAMDataSource {
* @param id Id of the reader.
* @return The index. Will preload the index if necessary.
*/
public BrowseableBAMIndex getIndex(final SAMReaderID id) {
public Object getIndex(final SAMReaderID id) {
if(enableLowMemorySharding)
return bamIndices.get(id);
else {

View File

@ -57,7 +57,7 @@ public class GATKBAMIndexUnitTest extends BaseTest {
SAMSequenceDictionary sequenceDictionary = reader.getFileHeader().getSequenceDictionary();
reader.close();
bamIndex = new GATKBAMIndex(bamIndexFile,sequenceDictionary);
bamIndex = new GATKBAMIndex(bamIndexFile,0);
}
@Test