Rewrite of GATKBAMIndex to avoid mmaps causing false reports of heavy memory
usage. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5620 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
36d8f55286
commit
22a11e41e1
|
|
@ -1,328 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
|
||||||
|
|
||||||
import net.sf.samtools.GATKBin;
|
|
||||||
import net.sf.samtools.GATKChunk;
|
|
||||||
import net.sf.samtools.util.CloseableIterator;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.FileOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.ByteOrder;
|
|
||||||
import java.nio.channels.FileChannel;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.NoSuchElementException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates an 'index of the index' for a particular reference sequence
|
|
||||||
* within the BAM file, for easier whole-BAM-file traversal.
|
|
||||||
* file.
|
|
||||||
*/
|
|
||||||
public class BAMIndexBinIterator {
|
|
||||||
/**
|
|
||||||
* The source of index data.
|
|
||||||
*/
|
|
||||||
private final GATKBAMIndex index;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The file storing the index data.
|
|
||||||
*/
|
|
||||||
private final File indexFile;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* File storing index metadata.
|
|
||||||
*/
|
|
||||||
private final File metaIndexFile;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reference sequence that temporary file is based on.
|
|
||||||
*/
|
|
||||||
private final int referenceSequence;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Position of the linear index in the BAM on disk for the given reference sequence.
|
|
||||||
*/
|
|
||||||
private final long linearIndexPosition;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Size of the linear index for the given reference sequence.
|
|
||||||
*/
|
|
||||||
private final int linearIndexSize;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Size of a long in bytes.
|
|
||||||
*/
|
|
||||||
private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;
|
|
||||||
|
|
||||||
public BAMIndexBinIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) {
|
|
||||||
this.index = index;
|
|
||||||
this.indexFile = indexFile;
|
|
||||||
this.referenceSequence = referenceSequence;
|
|
||||||
|
|
||||||
index.seek(4);
|
|
||||||
|
|
||||||
final int sequenceCount = index.readInteger();
|
|
||||||
|
|
||||||
if (referenceSequence >= sequenceCount)
|
|
||||||
throw new ReviewedStingException(String.format("Reference sequence past end of genome; reference sequence = %d, sequence count = %d",referenceSequence,sequenceCount));
|
|
||||||
|
|
||||||
index.skipToSequence(referenceSequence);
|
|
||||||
|
|
||||||
int binCount = index.readInteger();
|
|
||||||
|
|
||||||
try {
|
|
||||||
metaIndexFile = File.createTempFile("bammetaindex."+referenceSequence,null);
|
|
||||||
metaIndexFile.deleteOnExit();
|
|
||||||
|
|
||||||
FileOutputStream metaIndex = new FileOutputStream(metaIndexFile);
|
|
||||||
FileChannel metaIndexChannel = metaIndex.getChannel();
|
|
||||||
|
|
||||||
// zero out the contents of the file. Arrays of primitives in java are always zeroed out by default.
|
|
||||||
byte[] emptyContents = new byte[GATKBAMIndex.MAX_BINS*LONG_SIZE_IN_BYTES]; // byte array is zeroed out by default.
|
|
||||||
metaIndexChannel.write(ByteBuffer.wrap(emptyContents));
|
|
||||||
|
|
||||||
ByteBuffer positionBuffer = ByteBuffer.allocate(emptyContents.length);
|
|
||||||
positionBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
|
||||||
|
|
||||||
// Write the positions of the bins in the index
|
|
||||||
for (int binNumber = 0; binNumber < binCount; binNumber++) {
|
|
||||||
long position = index.position();
|
|
||||||
|
|
||||||
final int indexBin = index.readInteger();
|
|
||||||
|
|
||||||
metaIndexChannel.position(indexBin*LONG_SIZE_IN_BYTES);
|
|
||||||
positionBuffer.putLong(position);
|
|
||||||
positionBuffer.flip();
|
|
||||||
|
|
||||||
metaIndexChannel.write(positionBuffer);
|
|
||||||
positionBuffer.flip();
|
|
||||||
|
|
||||||
final int nChunks = index.readInteger();
|
|
||||||
index.skipBytes(16 * nChunks);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read the size of the linear index and the first position.
|
|
||||||
linearIndexSize = index.readInteger();
|
|
||||||
linearIndexPosition = index.position();
|
|
||||||
|
|
||||||
metaIndexChannel.close();
|
|
||||||
metaIndex.close();
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to write BAM metaindex",ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() {
|
|
||||||
metaIndexFile.delete();
|
|
||||||
}
|
|
||||||
|
|
||||||
public CloseableIterator<GATKBin> getIteratorOverLevel(int level) {
|
|
||||||
return new LevelIterator(level);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the linear index entry for the specified genomic start.
|
|
||||||
* @param genomicStart Starting location for which to search.
|
|
||||||
* @return Linear index entry corresponding to this genomic start.
|
|
||||||
*/
|
|
||||||
public long getLinearIndexEntry(final int genomicStart) {
|
|
||||||
final int indexOffset = (genomicStart-1 >> 14);
|
|
||||||
// If the number of bins is exceeded, don't perform any trimming.
|
|
||||||
if(indexOffset >= linearIndexSize)
|
|
||||||
return 0L;
|
|
||||||
|
|
||||||
FileInputStream indexInputStream;
|
|
||||||
try {
|
|
||||||
indexInputStream = new FileInputStream(indexFile);
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to open index file for reading");
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
indexInputStream.getChannel().position(linearIndexPosition+(indexOffset*LONG_SIZE_IN_BYTES));
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to position index file for reading");
|
|
||||||
}
|
|
||||||
|
|
||||||
ByteBuffer indexReader = ByteBuffer.allocate(LONG_SIZE_IN_BYTES);
|
|
||||||
indexReader.order(ByteOrder.LITTLE_ENDIAN);
|
|
||||||
|
|
||||||
try {
|
|
||||||
indexInputStream.getChannel().read(indexReader);
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to position index file for reading");
|
|
||||||
}
|
|
||||||
|
|
||||||
indexReader.flip();
|
|
||||||
final long linearIndexEntry = indexReader.getLong();
|
|
||||||
|
|
||||||
try {
|
|
||||||
indexInputStream.close();
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to close index file.");
|
|
||||||
}
|
|
||||||
|
|
||||||
return linearIndexEntry;
|
|
||||||
}
|
|
||||||
|
|
||||||
private class LevelIterator implements CloseableIterator<GATKBin> {
|
|
||||||
/**
|
|
||||||
* The raw BAM index file with unordered bins.
|
|
||||||
*/
|
|
||||||
private final FileInputStream indexInputStream;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The index metafile, with pointers to ordered bins.
|
|
||||||
*/
|
|
||||||
private final FileInputStream metaIndexInputStream;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The first and last bins in the level.
|
|
||||||
*/
|
|
||||||
private final int firstBinNumber, lastBinNumber;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The current bin in the index.
|
|
||||||
*/
|
|
||||||
private int currentBinNumber;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Position of the most recent chunk data.
|
|
||||||
*/
|
|
||||||
private GATKBin nextBin = null;
|
|
||||||
|
|
||||||
public LevelIterator(final int level) {
|
|
||||||
try {
|
|
||||||
indexInputStream = new FileInputStream(indexFile);
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to open index file for reading");
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
metaIndexInputStream = new FileInputStream(metaIndexFile);
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to open index metafile for reading");
|
|
||||||
}
|
|
||||||
|
|
||||||
firstBinNumber = GATKBAMIndex.getFirstBinInLevel(level);
|
|
||||||
lastBinNumber = firstBinNumber + index.getLevelSize(level) - 1;
|
|
||||||
|
|
||||||
currentBinNumber = firstBinNumber - 1;
|
|
||||||
advance();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() {
|
|
||||||
try {
|
|
||||||
indexInputStream.close();
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to close index file.");
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
metaIndexInputStream.close();
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to close index metafile");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasNext() {
|
|
||||||
return nextBin != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public GATKBin next() {
|
|
||||||
if(!hasNext())
|
|
||||||
throw new NoSuchElementException("Out of elements in BAMIndexBinIterator");
|
|
||||||
GATKBin currentPosition = nextBin;
|
|
||||||
advance();
|
|
||||||
return currentPosition;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() { throw new UnsupportedOperationException("Cannot remove from a LevelIterator"); }
|
|
||||||
|
|
||||||
private void advance() {
|
|
||||||
ByteBuffer indexFilePositionDecoder = ByteBuffer.allocate(LONG_SIZE_IN_BYTES*2);
|
|
||||||
indexFilePositionDecoder.order(ByteOrder.LITTLE_ENDIAN);
|
|
||||||
|
|
||||||
nextBin = null;
|
|
||||||
try {
|
|
||||||
indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES);
|
|
||||||
while(nextBin == null && currentBinNumber < lastBinNumber) {
|
|
||||||
currentBinNumber++;
|
|
||||||
metaIndexInputStream.getChannel().position(currentBinNumber*LONG_SIZE_IN_BYTES);
|
|
||||||
metaIndexInputStream.getChannel().read(indexFilePositionDecoder);
|
|
||||||
indexFilePositionDecoder.flip();
|
|
||||||
long currentPosition = indexFilePositionDecoder.getLong();
|
|
||||||
indexFilePositionDecoder.flip();
|
|
||||||
|
|
||||||
if(currentPosition != 0) {
|
|
||||||
indexInputStream.getChannel().position(currentPosition);
|
|
||||||
indexInputStream.getChannel().read(indexFilePositionDecoder);
|
|
||||||
|
|
||||||
indexFilePositionDecoder.flip();
|
|
||||||
int binNumber = indexFilePositionDecoder.getInt();
|
|
||||||
if(binNumber != currentBinNumber)
|
|
||||||
throw new ReviewedStingException("Index file and metaindex file are out of sync.");
|
|
||||||
|
|
||||||
int nChunks = indexFilePositionDecoder.getInt();
|
|
||||||
GATKChunk[] chunks = new GATKChunk[nChunks];
|
|
||||||
|
|
||||||
indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES*2);
|
|
||||||
indexFilePositionDecoder.clear();
|
|
||||||
|
|
||||||
for (int ci = 0; ci < nChunks; ci++) {
|
|
||||||
indexInputStream.getChannel().read(indexFilePositionDecoder);
|
|
||||||
|
|
||||||
indexFilePositionDecoder.flip();
|
|
||||||
final long chunkBegin = indexFilePositionDecoder.getLong();
|
|
||||||
final long chunkEnd = indexFilePositionDecoder.getLong();
|
|
||||||
chunks[ci] = new GATKChunk(chunkBegin, chunkEnd);
|
|
||||||
|
|
||||||
indexFilePositionDecoder.flip();
|
|
||||||
}
|
|
||||||
|
|
||||||
nextBin = new GATKBin(referenceSequence,binNumber);
|
|
||||||
nextBin.setChunkList(chunks);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch(IOException ex) {
|
|
||||||
throw new ReviewedStingException("Unable to close index metafile");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -88,10 +88,10 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new BAM schedule based on the given index.
|
* Create a new BAM schedule based on the given index.
|
||||||
* @param indices Mapping from readers to indices.
|
* @param indexFiles Index files.
|
||||||
* @param intervals List of
|
* @param intervals List of
|
||||||
*/
|
*/
|
||||||
public BAMSchedule(final Map<SAMReaderID,GATKBAMIndex> indices, final List<GenomeLoc> intervals) {
|
public BAMSchedule(final Map<SAMReaderID,File> indexFiles, final List<GenomeLoc> intervals) {
|
||||||
if(intervals.isEmpty())
|
if(intervals.isEmpty())
|
||||||
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
|
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
|
||||||
|
|
||||||
|
|
@ -99,11 +99,10 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
||||||
|
|
||||||
createScheduleFile();
|
createScheduleFile();
|
||||||
|
|
||||||
readerIDs.addAll(indices.keySet());
|
readerIDs.addAll(indexFiles.keySet());
|
||||||
|
|
||||||
for(final SAMReaderID reader: readerIDs) {
|
for(final SAMReaderID reader: readerIDs) {
|
||||||
GATKBAMIndex index = indices.get(reader);
|
GATKBAMIndex index = new GATKBAMIndex(indexFiles.get(reader),referenceSequence);
|
||||||
BAMIndexContent indexContent = index.getQueryResults(referenceSequence);
|
|
||||||
|
|
||||||
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
|
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
|
||||||
Iterator<GenomeLoc> locusIterator = intervals.iterator();
|
Iterator<GenomeLoc> locusIterator = intervals.iterator();
|
||||||
|
|
@ -133,7 +132,7 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
||||||
|
|
||||||
// Code at this point knows that the current bin is neither before nor after the current locus,
|
// Code at this point knows that the current bin is neither before nor after the current locus,
|
||||||
// so it must overlap. Add this region to the filesystem.
|
// so it must overlap. Add this region to the filesystem.
|
||||||
final GATKBAMFileSpan fileSpan = index.getSpanOverlapping(indexContent,bin);
|
final GATKBAMFileSpan fileSpan = index.getSpanOverlapping(bin);
|
||||||
|
|
||||||
if(!fileSpan.isEmpty()) {
|
if(!fileSpan.isEmpty()) {
|
||||||
// File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves.
|
// File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves.
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,7 @@ import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
|
@ -46,7 +47,7 @@ import java.util.NoSuchElementException;
|
||||||
public class BAMScheduler implements Iterator<FilePointer> {
|
public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
private final SAMDataSource dataSource;
|
private final SAMDataSource dataSource;
|
||||||
|
|
||||||
private final Map<SAMReaderID,GATKBAMIndex> indices = new HashMap<SAMReaderID,GATKBAMIndex>();
|
private final Map<SAMReaderID, File> indexFiles = new HashMap<SAMReaderID,File>();
|
||||||
|
|
||||||
private FilePointer nextFilePointer = null;
|
private FilePointer nextFilePointer = null;
|
||||||
|
|
||||||
|
|
@ -59,7 +60,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||||
indices.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
|
indexFiles.put(reader,(File)dataSource.getIndex(reader));
|
||||||
this.loci = loci;
|
this.loci = loci;
|
||||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||||
if(locusIterator.hasNext())
|
if(locusIterator.hasNext())
|
||||||
|
|
@ -104,7 +105,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
int coveredRegionStop = Integer.MAX_VALUE;
|
int coveredRegionStop = Integer.MAX_VALUE;
|
||||||
GenomeLoc coveredRegion = null;
|
GenomeLoc coveredRegion = null;
|
||||||
|
|
||||||
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indices,currentLocus);
|
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus);
|
||||||
|
|
||||||
// No overlapping data at all.
|
// No overlapping data at all.
|
||||||
if(scheduleEntry != null) {
|
if(scheduleEntry != null) {
|
||||||
|
|
@ -117,7 +118,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
else {
|
else {
|
||||||
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
|
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
|
||||||
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
|
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
|
||||||
for(SAMReaderID reader: indices.keySet())
|
for(SAMReaderID reader: indexFiles.keySet())
|
||||||
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
|
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -183,11 +184,11 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the next overlapping tree of bins associated with the given BAM file.
|
* Get the next overlapping tree of bins associated with the given BAM file.
|
||||||
* @param indices BAM index representation.
|
* @param indexFiles BAM index file.
|
||||||
* @param currentLocus The actual locus for which to check overlap.
|
* @param currentLocus The actual locus for which to check overlap.
|
||||||
* @return The next schedule entry overlapping with the given list of loci.
|
* @return The next schedule entry overlapping with the given list of loci.
|
||||||
*/
|
*/
|
||||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
|
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,File> indexFiles, final GenomeLoc currentLocus) {
|
||||||
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
||||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
|
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
|
||||||
if(bamScheduleIterator != null)
|
if(bamScheduleIterator != null)
|
||||||
|
|
@ -201,7 +202,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
||||||
lociInContig.add(locus);
|
lociInContig.add(locus);
|
||||||
}
|
}
|
||||||
|
|
||||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
|
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indexFiles,lociInContig));
|
||||||
}
|
}
|
||||||
|
|
||||||
if(!bamScheduleIterator.hasNext())
|
if(!bamScheduleIterator.hasNext())
|
||||||
|
|
|
||||||
|
|
@ -23,24 +23,20 @@
|
||||||
*/
|
*/
|
||||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||||
|
|
||||||
import net.sf.samtools.BAMIndex;
|
|
||||||
import net.sf.samtools.BAMIndexMetaData;
|
|
||||||
import net.sf.samtools.Bin;
|
import net.sf.samtools.Bin;
|
||||||
import net.sf.samtools.BrowseableBAMIndex;
|
|
||||||
import net.sf.samtools.GATKBAMFileSpan;
|
import net.sf.samtools.GATKBAMFileSpan;
|
||||||
import net.sf.samtools.GATKBin;
|
import net.sf.samtools.GATKBin;
|
||||||
import net.sf.samtools.GATKBinList;
|
|
||||||
import net.sf.samtools.GATKChunk;
|
import net.sf.samtools.GATKChunk;
|
||||||
import net.sf.samtools.LinearIndex;
|
import net.sf.samtools.LinearIndex;
|
||||||
import net.sf.samtools.SAMException;
|
import net.sf.samtools.SAMException;
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
|
||||||
import net.sf.samtools.util.RuntimeIOException;
|
import net.sf.samtools.util.RuntimeIOException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.ByteOrder;
|
import java.nio.ByteOrder;
|
||||||
import java.nio.MappedByteBuffer;
|
|
||||||
import java.nio.channels.FileChannel;
|
import java.nio.channels.FileChannel;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -50,7 +46,7 @@ import java.util.*;
|
||||||
* @author mhanna
|
* @author mhanna
|
||||||
* @version 0.1
|
* @version 0.1
|
||||||
*/
|
*/
|
||||||
public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
public class GATKBAMIndex {
|
||||||
/**
|
/**
|
||||||
* Reports the total amount of genomic data that any bin can index.
|
* Reports the total amount of genomic data that any bin can index.
|
||||||
*/
|
*/
|
||||||
|
|
@ -66,46 +62,75 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
||||||
*/
|
*/
|
||||||
public static final int MAX_BINS = 37450; // =(8^6-1)/7+1
|
public static final int MAX_BINS = 37450; // =(8^6-1)/7+1
|
||||||
|
|
||||||
public static final int MAX_LINEAR_INDEX_SIZE = MAX_BINS+1-LEVEL_STARTS[LEVEL_STARTS.length-1];
|
|
||||||
|
|
||||||
private final File mFile;
|
private final File mFile;
|
||||||
private final MappedByteBuffer mFileBuffer;
|
|
||||||
|
|
||||||
private SAMSequenceDictionary mBamDictionary = null;
|
private final List<GATKBin> bins = new ArrayList<GATKBin>();
|
||||||
|
private final LinearIndex linearIndex;
|
||||||
|
|
||||||
public GATKBAMIndex(final File file, final SAMSequenceDictionary dictionary) {
|
public GATKBAMIndex(final File file, final int referenceSequence) {
|
||||||
mFile = file;
|
mFile = file;
|
||||||
mBamDictionary = dictionary;
|
|
||||||
// Open the file stream.
|
// Open the file stream.
|
||||||
try {
|
|
||||||
FileInputStream fileStream = new FileInputStream(mFile);
|
|
||||||
FileChannel fileChannel = fileStream.getChannel();
|
|
||||||
mFileBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0L, fileChannel.size());
|
|
||||||
mFileBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
|
||||||
|
|
||||||
fileChannel.close();
|
FileInputStream fileStream = null;
|
||||||
fileStream.close();
|
FileChannel fileChannel = null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
fileStream = new FileInputStream(mFile);
|
||||||
|
fileChannel = fileStream.getChannel();
|
||||||
}
|
}
|
||||||
catch (IOException exc) {
|
catch (IOException exc) {
|
||||||
throw new RuntimeIOException(exc.getMessage(), exc);
|
throw new RuntimeIOException(exc.getMessage(), exc);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify the magic number.
|
// Verify the magic number.
|
||||||
seek(0);
|
seek(fileChannel,0);
|
||||||
final byte[] buffer = new byte[4];
|
final byte[] buffer = new byte[4];
|
||||||
readBytes(buffer);
|
readBytes(fileChannel,buffer);
|
||||||
if (!Arrays.equals(buffer, GATKBAMFileConstants.BAM_INDEX_MAGIC)) {
|
if (!Arrays.equals(buffer, GATKBAMFileConstants.BAM_INDEX_MAGIC)) {
|
||||||
throw new RuntimeException("Invalid file header in BAM index " + mFile +
|
throw new RuntimeException("Invalid file header in BAM index " + mFile +
|
||||||
": " + new String(buffer));
|
": " + new String(buffer));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
seek(fileChannel,4);
|
||||||
* Gets the file backing this index.
|
|
||||||
* @return The index file.
|
final int sequenceCount = readInteger(fileChannel);
|
||||||
*/
|
|
||||||
public File getIndexFile() {
|
if (referenceSequence >= sequenceCount)
|
||||||
return mFile;
|
throw new ReviewedStingException("Invalid sequence number " + referenceSequence);
|
||||||
|
|
||||||
|
skipToSequence(fileChannel,referenceSequence);
|
||||||
|
|
||||||
|
int binCount = readInteger(fileChannel);
|
||||||
|
for (int binNumber = 0; binNumber < binCount; binNumber++) {
|
||||||
|
final int indexBin = readInteger(fileChannel);
|
||||||
|
final int nChunks = readInteger(fileChannel);
|
||||||
|
List<GATKChunk> chunks = new ArrayList<GATKChunk>(nChunks);
|
||||||
|
for (int ci = 0; ci < nChunks; ci++) {
|
||||||
|
final long chunkBegin = readLong(fileChannel);
|
||||||
|
final long chunkEnd = readLong(fileChannel);
|
||||||
|
chunks.add(new GATKChunk(chunkBegin, chunkEnd));
|
||||||
|
}
|
||||||
|
GATKBin bin = new GATKBin(referenceSequence, indexBin);
|
||||||
|
bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()]));
|
||||||
|
while(indexBin >= bins.size())
|
||||||
|
bins.add(null);
|
||||||
|
bins.set(indexBin,bin);
|
||||||
|
}
|
||||||
|
|
||||||
|
final int nLinearBins = readInteger(fileChannel);
|
||||||
|
long[] linearIndexEntries = new long[nLinearBins];
|
||||||
|
for(int linearIndexOffset = 0; linearIndexOffset < nLinearBins; linearIndexOffset++)
|
||||||
|
linearIndexEntries[linearIndexOffset] = readLong(fileChannel);
|
||||||
|
|
||||||
|
linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries);
|
||||||
|
|
||||||
|
try {
|
||||||
|
fileChannel.close();
|
||||||
|
fileStream.close();
|
||||||
|
}
|
||||||
|
catch (IOException exc) {
|
||||||
|
throw new RuntimeIOException(exc.getMessage(), exc);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -142,7 +167,6 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
||||||
* @param bin The bin for which to determine the level.
|
* @param bin The bin for which to determine the level.
|
||||||
* @return the level associated with the given bin number.
|
* @return the level associated with the given bin number.
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
public int getLevelForBin(final Bin bin) {
|
public int getLevelForBin(final Bin bin) {
|
||||||
GATKBin gatkBin = new GATKBin(bin);
|
GATKBin gatkBin = new GATKBin(bin);
|
||||||
if(gatkBin.getBinNumber() >= MAX_BINS)
|
if(gatkBin.getBinNumber() >= MAX_BINS)
|
||||||
|
|
@ -171,7 +195,6 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
||||||
* @param bin The bin to test.
|
* @param bin The bin to test.
|
||||||
* @return The last position that the given bin can represent.
|
* @return The last position that the given bin can represent.
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
public int getLastLocusInBin(final Bin bin) {
|
public int getLastLocusInBin(final Bin bin) {
|
||||||
final int level = getLevelForBin(bin);
|
final int level = getLevelForBin(bin);
|
||||||
final int levelStart = LEVEL_STARTS[level];
|
final int levelStart = LEVEL_STARTS[level];
|
||||||
|
|
@ -179,164 +202,32 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
||||||
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
|
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getNumberOfReferences() {
|
|
||||||
seek(4);
|
|
||||||
return readInteger();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Use to get close to the unmapped reads at the end of a BAM file.
|
|
||||||
* @return The file offset of the first record in the last linear bin, or -1
|
|
||||||
* if there are no elements in linear bins (i.e. no mapped reads).
|
|
||||||
*/
|
|
||||||
public long getStartOfLastLinearBin() {
|
|
||||||
seek(4);
|
|
||||||
|
|
||||||
final int sequenceCount = readInteger();
|
|
||||||
// Because no reads may align to the last sequence in the sequence dictionary,
|
|
||||||
// grab the last element of the linear index for each sequence, and return
|
|
||||||
// the last one from the last sequence that has one.
|
|
||||||
long lastLinearIndexPointer = -1;
|
|
||||||
for (int i = 0; i < sequenceCount; i++) {
|
|
||||||
// System.out.println("# Sequence TID: " + i);
|
|
||||||
final int nBins = readInteger();
|
|
||||||
// System.out.println("# nBins: " + nBins);
|
|
||||||
for (int j1 = 0; j1 < nBins; j1++) {
|
|
||||||
// Skip bin #
|
|
||||||
skipBytes(4);
|
|
||||||
final int nChunks = readInteger();
|
|
||||||
// Skip chunks
|
|
||||||
skipBytes(16 * nChunks);
|
|
||||||
}
|
|
||||||
final int nLinearBins = readInteger();
|
|
||||||
if (nLinearBins > 0) {
|
|
||||||
// Skip to last element of list of linear bins
|
|
||||||
skipBytes(8 * (nLinearBins - 1));
|
|
||||||
lastLinearIndexPointer = readLong();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return lastLinearIndexPointer;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets meta data for the given reference including information about number of aligned, unaligned, and noCoordinate records
|
|
||||||
* @param reference the reference of interest
|
|
||||||
* @return meta data for the reference
|
|
||||||
*/
|
|
||||||
public BAMIndexMetaData getMetaData(int reference) {
|
|
||||||
throw new UnsupportedOperationException("Cannot retrieve metadata for GATKBAMIndex");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns count of records unassociated with any reference. Call before the index file is closed
|
|
||||||
*
|
|
||||||
* @return meta data at the end of the bam index that indicates count of records holding no coordinates
|
|
||||||
* or null if no meta data (old index format)
|
|
||||||
*/
|
|
||||||
public Long getNoCoordinateCount() {
|
|
||||||
|
|
||||||
seek(4);
|
|
||||||
final int sequenceCount = readInteger();
|
|
||||||
|
|
||||||
skipToSequence(sequenceCount);
|
|
||||||
try { // in case of old index file without meta data
|
|
||||||
return readLong();
|
|
||||||
} catch (Exception e) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get list of regions of BAM file that may contain SAMRecords for the given range
|
|
||||||
* @param referenceIndex sequence of desired SAMRecords
|
|
||||||
* @param startPos 1-based start of the desired interval, inclusive
|
|
||||||
* @param endPos 1-based end of the desired interval, inclusive
|
|
||||||
* @return the virtual file position. Each pair is the first and last virtual file position
|
|
||||||
* in a range that can be scanned to find SAMRecords that overlap the given positions.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public GATKBAMFileSpan getSpanOverlapping(final int referenceIndex, final int startPos, final int endPos) {
|
|
||||||
BAMIndexContent queryResults = getQueryResults(referenceIndex);
|
|
||||||
|
|
||||||
if(queryResults == null)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
GATKBinList overlappingBins = getBinsOverlapping(referenceIndex,startPos,endPos);
|
|
||||||
|
|
||||||
// System.out.println("# Sequence target TID: " + referenceIndex);
|
|
||||||
List<GATKBin> bins = new ArrayList<GATKBin>();
|
|
||||||
for(GATKBin bin: queryResults.getBins()) {
|
|
||||||
if (overlappingBins.getBins().get(bin.getBinNumber()))
|
|
||||||
bins.add(bin);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (bins.isEmpty()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
|
|
||||||
for(GATKBin bin: bins) {
|
|
||||||
for(GATKChunk chunk: bin.getChunkList())
|
|
||||||
chunkList.add(chunk.clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (chunkList.isEmpty()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
chunkList = optimizeChunkList(chunkList,queryResults.getLinearIndex().getMinimumOffset(startPos));
|
|
||||||
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform an overlapping query of all bins bounding the given location.
|
* Perform an overlapping query of all bins bounding the given location.
|
||||||
* @param bin The bin over which to perform an overlapping query.
|
* @param bin The bin over which to perform an overlapping query.
|
||||||
* @return The file pointers
|
* @return The file pointers
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
public GATKBAMFileSpan getSpanOverlapping(final Bin bin) {
|
public GATKBAMFileSpan getSpanOverlapping(final Bin bin) {
|
||||||
if(bin == null)
|
if(bin == null)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
GATKBin gatkBin = new GATKBin(bin);
|
GATKBin gatkBin = new GATKBin(bin);
|
||||||
|
|
||||||
final int referenceSequence = gatkBin.getReferenceSequence();
|
|
||||||
BAMIndexContent indexQuery = getQueryResults(referenceSequence);
|
|
||||||
|
|
||||||
return getSpanOverlapping(indexQuery,bin);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Perform an overlapping query of all bins bounding the given location.
|
|
||||||
* @param bin The bin over which to perform an overlapping query.
|
|
||||||
* @return The file pointers
|
|
||||||
*/
|
|
||||||
public GATKBAMFileSpan getSpanOverlapping(final BAMIndexContent indexContent, final Bin bin) {
|
|
||||||
if(bin == null)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
GATKBin gatkBin = new GATKBin(bin);
|
|
||||||
|
|
||||||
if(indexContent == null)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
final int binLevel = getLevelForBin(bin);
|
final int binLevel = getLevelForBin(bin);
|
||||||
final int firstLocusInBin = getFirstLocusInBin(bin);
|
final int firstLocusInBin = getFirstLocusInBin(bin);
|
||||||
|
|
||||||
// Add the specified bin to the tree if it exists.
|
// Add the specified bin to the tree if it exists.
|
||||||
List<GATKBin> binTree = new ArrayList<GATKBin>();
|
List<GATKBin> binTree = new ArrayList<GATKBin>();
|
||||||
if(indexContent.containsBin(gatkBin))
|
if(gatkBin.getBinNumber() < bins.size() && bins.get(gatkBin.getBinNumber()) != null)
|
||||||
binTree.add(indexContent.getBins().getBin(gatkBin.getBinNumber()));
|
binTree.add(bins.get(gatkBin.getBinNumber()));
|
||||||
|
|
||||||
int currentBinLevel = binLevel;
|
int currentBinLevel = binLevel;
|
||||||
while(--currentBinLevel >= 0) {
|
while(--currentBinLevel >= 0) {
|
||||||
final int binStart = getFirstBinInLevel(currentBinLevel);
|
final int binStart = getFirstBinInLevel(currentBinLevel);
|
||||||
final int binWidth = getMaxAddressibleGenomicLocation()/getLevelSize(currentBinLevel);
|
final int binWidth = getMaxAddressibleGenomicLocation()/getLevelSize(currentBinLevel);
|
||||||
final int binNumber = firstLocusInBin/binWidth + binStart;
|
final int binNumber = firstLocusInBin/binWidth + binStart;
|
||||||
GATKBin parentBin = indexContent.getBins().getBin(binNumber);
|
if(binNumber < bins.size() && bins.get(binNumber) != null)
|
||||||
if(parentBin != null && indexContent.containsBin(parentBin))
|
binTree.add(bins.get(binNumber));
|
||||||
binTree.add(parentBin);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
|
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
|
||||||
|
|
@ -346,173 +237,10 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
||||||
}
|
}
|
||||||
|
|
||||||
final int start = getFirstLocusInBin(bin);
|
final int start = getFirstLocusInBin(bin);
|
||||||
chunkList = optimizeChunkList(chunkList,indexContent.getLinearIndex().getMinimumOffset(start));
|
chunkList = optimizeChunkList(chunkList,linearIndex.getMinimumOffset(start));
|
||||||
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
|
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
|
||||||
}
|
}
|
||||||
|
|
||||||
public GATKBAMFileSpan getContentsOfBin(final Bin bin) {
|
|
||||||
if(bin == null)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
GATKBin gatkBin = new GATKBin(bin);
|
|
||||||
|
|
||||||
BAMIndexContent indexQuery = getQueryResults(gatkBin.getReferenceSequence());
|
|
||||||
|
|
||||||
if(indexQuery == null)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
GATKBin queriedBin = indexQuery.getBins().getBin(gatkBin.getBinNumber());
|
|
||||||
|
|
||||||
return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getChunkList()) : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Retrieves the linear index for the given reference sequence.
|
|
||||||
* @param referenceSequence Reference sequence number for which to retrieve the reference.
|
|
||||||
* @return The linear index for the given reference sequence.
|
|
||||||
*/
|
|
||||||
public LinearIndex getLinearIndex(int referenceSequence) {
|
|
||||||
return getQueryResults(referenceSequence).getLinearIndex();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a list of bins in the BAM file that may contain SAMRecords for the given range.
|
|
||||||
* @param referenceIndex sequence of desired SAMRecords
|
|
||||||
* @param startPos 1-based start of the desired interval, inclusive
|
|
||||||
* @param endPos 1-based end of the desired interval, inclusive
|
|
||||||
* @return a list of bins that contain relevant data.
|
|
||||||
*/
|
|
||||||
public GATKBinList getBinsOverlapping(final int referenceIndex, final int startPos, final int endPos) {
|
|
||||||
final BitSet regionBins = regionToBins(startPos,endPos);
|
|
||||||
if (regionBins == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return new GATKBinList(referenceIndex,regionBins);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected BAMIndexContent query(final int referenceSequence, final int startPos, final int endPos) {
|
|
||||||
seek(4);
|
|
||||||
|
|
||||||
List<GATKChunk> metaDataChunks = new ArrayList<GATKChunk>();
|
|
||||||
|
|
||||||
final int sequenceCount = readInteger();
|
|
||||||
|
|
||||||
if (referenceSequence >= sequenceCount) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
final BitSet regionBins = regionToBins(startPos, endPos);
|
|
||||||
if (regionBins == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
skipToSequence(referenceSequence);
|
|
||||||
|
|
||||||
int binCount = readInteger();
|
|
||||||
boolean metaDataSeen = false;
|
|
||||||
GATKBin[] bins = new GATKBin[getMaxBinNumberForReference(referenceSequence) +1];
|
|
||||||
for (int binNumber = 0; binNumber < binCount; binNumber++) {
|
|
||||||
final int indexBin = readInteger();
|
|
||||||
final int nChunks = readInteger();
|
|
||||||
List<GATKChunk> chunks = new ArrayList<GATKChunk>(nChunks);
|
|
||||||
// System.out.println("# bin[" + i + "] = " + indexBin + ", nChunks = " + nChunks);
|
|
||||||
GATKChunk lastChunk = null;
|
|
||||||
if (regionBins.get(indexBin)) {
|
|
||||||
for (int ci = 0; ci < nChunks; ci++) {
|
|
||||||
final long chunkBegin = readLong();
|
|
||||||
final long chunkEnd = readLong();
|
|
||||||
lastChunk = new GATKChunk(chunkBegin, chunkEnd);
|
|
||||||
chunks.add(lastChunk);
|
|
||||||
}
|
|
||||||
} else if (indexBin == MAX_BINS) {
|
|
||||||
// meta data - build the bin so that the count of bins is correct;
|
|
||||||
// but don't attach meta chunks to the bin, or normal queries will be off
|
|
||||||
for (int ci = 0; ci < nChunks; ci++) {
|
|
||||||
final long chunkBegin = readLong();
|
|
||||||
final long chunkEnd = readLong();
|
|
||||||
lastChunk = new GATKChunk(chunkBegin, chunkEnd);
|
|
||||||
metaDataChunks.add(lastChunk);
|
|
||||||
}
|
|
||||||
metaDataSeen = true;
|
|
||||||
continue; // don't create a Bin
|
|
||||||
} else {
|
|
||||||
skipBytes(16 * nChunks);
|
|
||||||
}
|
|
||||||
GATKBin bin = new GATKBin(referenceSequence, indexBin);
|
|
||||||
bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()]));
|
|
||||||
bins[indexBin] = bin;
|
|
||||||
}
|
|
||||||
|
|
||||||
final int nLinearBins = readInteger();
|
|
||||||
|
|
||||||
final int regionLinearBinStart = LinearIndex.convertToLinearIndexOffset(startPos);
|
|
||||||
final int regionLinearBinStop = endPos > 0 ? LinearIndex.convertToLinearIndexOffset(endPos) : nLinearBins-1;
|
|
||||||
final int actualStop = Math.min(regionLinearBinStop, nLinearBins -1);
|
|
||||||
|
|
||||||
long[] linearIndexEntries = new long[0];
|
|
||||||
if (regionLinearBinStart < nLinearBins) {
|
|
||||||
linearIndexEntries = new long[actualStop-regionLinearBinStart+1];
|
|
||||||
skipBytes(8 * regionLinearBinStart);
|
|
||||||
for(int linearBin = regionLinearBinStart; linearBin <= actualStop; linearBin++)
|
|
||||||
linearIndexEntries[linearBin-regionLinearBinStart] = readLong();
|
|
||||||
}
|
|
||||||
|
|
||||||
final LinearIndex linearIndex = new LinearIndex(referenceSequence,regionLinearBinStart,linearIndexEntries);
|
|
||||||
|
|
||||||
return new BAMIndexContent(referenceSequence, bins, binCount - (metaDataSeen? 1 : 0), linearIndex);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The maxiumum bin number for a reference sequence of a given length
|
|
||||||
*/
|
|
||||||
static int getMaxBinNumberForSequenceLength(int sequenceLength) {
|
|
||||||
return getFirstBinInLevel(getNumIndexLevels() - 1) + (sequenceLength >> 14);
|
|
||||||
// return 4680 + (sequenceLength >> 14); // note 4680 = getFirstBinInLevel(getNumIndexLevels() - 1)
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Looks up the cached BAM query results if they're still in the cache and not expired. Otherwise,
|
|
||||||
* retrieves the cache results from disk.
|
|
||||||
* @param referenceIndex The reference to load. CachingBAMFileIndex only stores index data for entire references.
|
|
||||||
* @return The index information for this reference.
|
|
||||||
*/
|
|
||||||
public BAMIndexContent getQueryResults(final int referenceIndex) {
|
|
||||||
// If not in the cache, attempt to load it from disk.
|
|
||||||
return query(referenceIndex,1,-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the possible number of bins for a given reference sequence.
|
|
||||||
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.
|
|
||||||
*/
|
|
||||||
protected int getMaxAddressibleGenomicLocation() {
|
|
||||||
return BIN_GENOMIC_SPAN;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get candidate bins for the specified region
|
|
||||||
* @param startPos 1-based start of target region, inclusive.
|
|
||||||
* @param endPos 1-based end of target region, inclusive.
|
|
||||||
* @return bit set for each bin that may contain SAMRecords in the target region.
|
|
||||||
*/
|
|
||||||
protected BitSet regionToBins(final int startPos, final int endPos) {
|
|
||||||
final int maxPos = 0x1FFFFFFF;
|
|
||||||
final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos;
|
|
||||||
final int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos;
|
|
||||||
if (start > end) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
int k;
|
|
||||||
final BitSet bitSet = new BitSet(MAX_BINS);
|
|
||||||
bitSet.set(0);
|
|
||||||
for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k);
|
|
||||||
for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k);
|
|
||||||
for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k);
|
|
||||||
for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k);
|
|
||||||
for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k);
|
|
||||||
return bitSet;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected List<GATKChunk> optimizeChunkList(final List<GATKChunk> chunks, final long minimumOffset) {
|
protected List<GATKChunk> optimizeChunkList(final List<GATKChunk> chunks, final long minimumOffset) {
|
||||||
GATKChunk lastChunk = null;
|
GATKChunk lastChunk = null;
|
||||||
Collections.sort(chunks);
|
Collections.sort(chunks);
|
||||||
|
|
@ -541,61 +269,76 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The maximum possible bin number for this reference sequence.
|
* Gets the possible number of bins for a given reference sequence.
|
||||||
* This is based on the maximum coordinate position of the reference
|
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.
|
||||||
* which is based on the size of the reference
|
|
||||||
*/
|
*/
|
||||||
private int getMaxBinNumberForReference(final int reference) {
|
protected int getMaxAddressibleGenomicLocation() {
|
||||||
try {
|
return BIN_GENOMIC_SPAN;
|
||||||
final int sequenceLength = mBamDictionary.getSequence(reference).getSequenceLength();
|
}
|
||||||
return getMaxBinNumberForSequenceLength(sequenceLength);
|
|
||||||
} catch (Exception e) {
|
|
||||||
return MAX_BINS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void skipToSequence(final int sequenceIndex) {
|
protected void skipToSequence(final FileChannel fileChannel, final int sequenceIndex) {
|
||||||
for (int i = 0; i < sequenceIndex; i++) {
|
for (int i = 0; i < sequenceIndex; i++) {
|
||||||
// System.out.println("# Sequence TID: " + i);
|
// System.out.println("# Sequence TID: " + i);
|
||||||
final int nBins = readInteger();
|
final int nBins = readInteger(fileChannel);
|
||||||
// System.out.println("# nBins: " + nBins);
|
// System.out.println("# nBins: " + nBins);
|
||||||
for (int j = 0; j < nBins; j++) {
|
for (int j = 0; j < nBins; j++) {
|
||||||
final int bin = readInteger();
|
final int bin = readInteger(fileChannel);
|
||||||
final int nChunks = readInteger();
|
final int nChunks = readInteger(fileChannel);
|
||||||
// System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks);
|
// System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks);
|
||||||
skipBytes(16 * nChunks);
|
skipBytes(fileChannel, 16 * nChunks);
|
||||||
}
|
}
|
||||||
final int nLinearBins = readInteger();
|
final int nLinearBins = readInteger(fileChannel);
|
||||||
// System.out.println("# nLinearBins: " + nLinearBins);
|
// System.out.println("# nLinearBins: " + nLinearBins);
|
||||||
skipBytes(8 * nLinearBins);
|
skipBytes(fileChannel, 8 * nLinearBins);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void readBytes(final byte[] bytes) {
|
private void readBytes(final FileChannel fileChannel, final byte[] bytes) {
|
||||||
mFileBuffer.get(bytes);
|
ByteBuffer buffer = ByteBuffer.wrap(bytes);
|
||||||
|
try {
|
||||||
|
fileChannel.read(buffer);
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new ReviewedStingException("Index: unable to read bytes from index file.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected int readInteger() {
|
private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8;
|
||||||
return mFileBuffer.getInt();
|
private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;
|
||||||
|
|
||||||
|
private ByteBuffer wrapBuffer(final byte[] bytes) {
|
||||||
|
ByteBuffer buffer = ByteBuffer.wrap(bytes);
|
||||||
|
buffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||||
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected long readLong() {
|
private int readInteger(final FileChannel fileChannel) {
|
||||||
return mFileBuffer.getLong();
|
byte[] bytes = new byte[INT_SIZE_IN_BYTES];
|
||||||
|
readBytes(fileChannel,bytes);
|
||||||
|
return wrapBuffer(bytes).getInt();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void skipBytes(final int count) {
|
private long readLong(final FileChannel fileChannel) {
|
||||||
mFileBuffer.position(mFileBuffer.position() + count);
|
byte[] bytes = new byte[LONG_SIZE_IN_BYTES];
|
||||||
|
readBytes(fileChannel,bytes);
|
||||||
|
return wrapBuffer(bytes).getLong();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void seek(final int position) {
|
private void skipBytes(final FileChannel fileChannel, final int count) {
|
||||||
mFileBuffer.position(position);
|
try {
|
||||||
|
fileChannel.position(fileChannel.position() + count);
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new ReviewedStingException("Index: unable to reposition file channel.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected long position() {
|
private void seek(final FileChannel fileChannel, final int position) {
|
||||||
return mFileBuffer.position();
|
try {
|
||||||
}
|
fileChannel.position(position);
|
||||||
|
}
|
||||||
@Override
|
catch(IOException ex) {
|
||||||
public void close() {
|
throw new ReviewedStingException("Index: unable to reposition of file channel.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -140,7 +140,7 @@ public class IntervalSharder {
|
||||||
// If this contig can't be found in the reference, skip over it.
|
// If this contig can't be found in the reference, skip over it.
|
||||||
if(referenceSequence == null && contig != null)
|
if(referenceSequence == null && contig != null)
|
||||||
continue;
|
continue;
|
||||||
final BrowseableBAMIndex index = dataSource.getIndex(id);
|
final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id);
|
||||||
binMerger.addReader(id,
|
binMerger.addReader(id,
|
||||||
index,
|
index,
|
||||||
referenceSequence.getSequenceIndex(),
|
referenceSequence.getSequenceIndex(),
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,7 @@ public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
||||||
private final GenomeLocParser parser;
|
private final GenomeLocParser parser;
|
||||||
|
|
||||||
public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||||
|
System.out.println("Low memory interval sharding enabled");
|
||||||
wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
|
wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
|
||||||
parser = loci.getGenomeLocParser();
|
parser = loci.getGenomeLocParser();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -84,7 +84,7 @@ public class SAMDataSource {
|
||||||
/**
|
/**
|
||||||
* Store BAM indices for each reader present.
|
* Store BAM indices for each reader present.
|
||||||
*/
|
*/
|
||||||
private final Map<SAMReaderID,GATKBAMIndex> bamIndices = new HashMap<SAMReaderID,GATKBAMIndex>();
|
private final Map<SAMReaderID,Object> bamIndices = new HashMap<SAMReaderID,Object>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* How far along is each reader?
|
* How far along is each reader?
|
||||||
|
|
@ -292,9 +292,7 @@ public class SAMDataSource {
|
||||||
for(SAMReaderID id: readerIDs) {
|
for(SAMReaderID id: readerIDs) {
|
||||||
File indexFile = findIndexFile(id.samFile);
|
File indexFile = findIndexFile(id.samFile);
|
||||||
if(indexFile != null) {
|
if(indexFile != null) {
|
||||||
SAMSequenceDictionary sequenceDictionary = readers.getReader(id).getFileHeader().getSequenceDictionary();
|
bamIndices.put(id,indexFile);
|
||||||
GATKBAMIndex index = new GATKBAMIndex(indexFile,sequenceDictionary);
|
|
||||||
bamIndices.put(id,index);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -428,7 +426,7 @@ public class SAMDataSource {
|
||||||
* @param id Id of the reader.
|
* @param id Id of the reader.
|
||||||
* @return The index. Will preload the index if necessary.
|
* @return The index. Will preload the index if necessary.
|
||||||
*/
|
*/
|
||||||
public BrowseableBAMIndex getIndex(final SAMReaderID id) {
|
public Object getIndex(final SAMReaderID id) {
|
||||||
if(enableLowMemorySharding)
|
if(enableLowMemorySharding)
|
||||||
return bamIndices.get(id);
|
return bamIndices.get(id);
|
||||||
else {
|
else {
|
||||||
|
|
|
||||||
|
|
@ -57,7 +57,7 @@ public class GATKBAMIndexUnitTest extends BaseTest {
|
||||||
SAMSequenceDictionary sequenceDictionary = reader.getFileHeader().getSequenceDictionary();
|
SAMSequenceDictionary sequenceDictionary = reader.getFileHeader().getSequenceDictionary();
|
||||||
reader.close();
|
reader.close();
|
||||||
|
|
||||||
bamIndex = new GATKBAMIndex(bamIndexFile,sequenceDictionary);
|
bamIndex = new GATKBAMIndex(bamIndexFile,0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue