Correcting my incomplete understanding of how the BAM file index actually works.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2833 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-02-12 16:15:19 +00:00
parent 5f74fffa02
commit 77af5822d4
5 changed files with 126 additions and 19 deletions

View File

@ -36,7 +36,17 @@ import java.util.*;
*/
public class BAMFileIndex2 extends BAMFileIndex
{
/**
* Reports the total amount of genomic data that any bin can index.
*/
private static final int BIN_SPAN = 512*1024*1024;
/**
* Reports the maximum number of bins in a BAM file index, based on the the pseudocode
* in section 1.2 of the BAM spec.
*/
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
private static final int BAM_LIDX_SHIFT = 14;
/**
@ -87,6 +97,30 @@ public class BAMFileIndex2 extends BAMFileIndex
throw new SAMException("Unable to find correct bin for bin number "+binNumber);
}
/**
* Gets the first locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
protected int getFirstLocusInBin(final Bin bin) {
final int level = getLevelForBinNumber(bin.binNumber);
final int levelStart = LEVEL_STARTS[level];
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
return (bin.binNumber - levelStart)*(BIN_SPAN/levelSize);
}
/**
* Gets the last locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
protected int getLastLocusInBin(final Bin bin) {
final int level = getLevelForBinNumber(bin.binNumber);
final int levelStart = LEVEL_STARTS[level];
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
return (bin.binNumber - levelStart + 1)*(BIN_SPAN/levelSize) - 1;
}
/**
* Completely load the index into memory.
* @param file File to load.
@ -162,7 +196,7 @@ public class BAMFileIndex2 extends BAMFileIndex
long[] getFilePointersContaining(final int referenceIndex, final int startPos, final int endPos) {
List<Bin> bins = getBinsContaining(referenceIndex,startPos,endPos);
// System.out.println("# Sequence target TID: " + referenceIndex);
if (bins.size() == 0) {
if (bins == null) {
return null;
}
@ -185,10 +219,6 @@ public class BAMFileIndex2 extends BAMFileIndex
return convertToArray(chunkList);
}
long[] getFilePointersBounding(final Bin bin) {
return convertToArray(binToChunks.get(bin));
}
/**
* Get a list of bins in the BAM file that may contain SAMRecords for the given range.
* @param referenceIndex sequence of desired SAMRecords

View File

@ -194,8 +194,15 @@ class BAMFileReader2
return bins;
}
public List<Chunk> getFilePointersBounding(Bin bin) {
return Chunk.toChunkList(getFileIndex().getFilePointersBounding(bin));
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
final SAMFileHeader fileHeader = getFileHeader();
long[] filePointers = null;
int referenceIndex = fileHeader.getSequenceIndex(sequence);
if (referenceIndex != -1) {
final BAMFileIndex2 fileIndex = getFileIndex();
filePointers = fileIndex.getFilePointersContaining(referenceIndex,start,end);
}
return (filePointers != null) ? Chunk.toChunkList(filePointers) : Collections.<Chunk>emptyList();
}
/**
@ -493,7 +500,7 @@ class BAMFileReader2
throws IOException {
while (true) {
// Advance to next file block if necessary
while (mCompressedInputStream.getFilePointer() > mFilePointerLimit) {
while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) {
if (mFilePointers == null ||
mFilePointerIndex >= mFilePointers.length) {
return null;

View File

@ -88,7 +88,7 @@ public class SAMFileReader2 extends SAMFileReader {
* @return Number of levels in this index.
*/
public int getNumIndexLevels() {
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
if(fileIndex == null)
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
return fileIndex.getNumIndexLevels();
@ -100,12 +100,36 @@ public class SAMFileReader2 extends SAMFileReader {
* @return the level associated with the given bin number.
*/
public int getLevelForBin(final Bin bin) {
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
if(fileIndex == null)
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
return fileIndex.getLevelForBinNumber(bin.binNumber);
}
/**
* Gets the first locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
public int getFirstLocusInBin(final Bin bin) {
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
if(fileIndex == null)
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
return fileIndex.getFirstLocusInBin(bin);
}
/**
* Gets the last locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
public int getLastLocusInBin(final Bin bin) {
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
if(fileIndex == null)
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
return fileIndex.getLastLocusInBin(bin);
}
/**
* Iterate through the given chunks in the file.
* @param chunks List of chunks for which to retrieve data.
@ -123,10 +147,10 @@ public class SAMFileReader2 extends SAMFileReader {
return reader.getOverlappingBins(sequence,start,end);
}
public List<Chunk> getFilePointersBounding(final Bin bin) {
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
return reader.getFilePointersBounding(bin);
return reader.getFilePointersBounding(sequence,start,end);
}
private Field getField(String fieldName) {

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.datasources.shards;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
@ -75,21 +76,29 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
locationToReference.get(location.getContig()).add(location);
}
// Group the loci by bin, sorted in the order in which bins appear in the file. Only use the smallest bins in the set.
for(String contig: locationToReference.keySet()) {
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
SortedMap<Bin,List<GenomeLoc>> bins = new TreeMap<Bin,List<GenomeLoc>>();
for(GenomeLoc location: locationToReference.get(contig)) {
List<Bin> binsForLocation = blockDrivenDataSource.getOverlappingBins(location);
for(Bin bin: binsForLocation) {
if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) {
final int firstLoc = blockDrivenDataSource.getFirstLocusInBin(bin);
final int lastLoc = blockDrivenDataSource.getLastLocusInBin(bin);
if(!bins.containsKey(bin))
bins.put(bin,new ArrayList<GenomeLoc>());
bins.get(bin).add(location);
bins.get(bin).add(GenomeLocParser.createGenomeLoc(location.getContig(),
Math.max(location.getStart(),firstLoc),
Math.min(location.getStop(),lastLoc)));
}
}
}
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet())
// Add a record of the new bin structure.
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet()) {
Collections.sort(entry.getValue());
filePointers.add(new FilePointer(entry.getKey(),entry.getValue()));
}
}
filePointerIterator = filePointers.iterator();
@ -111,7 +120,14 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
*/
public IndexDelimitedLocusShard next() {
FilePointer nextFilePointer = filePointerIterator.next();
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin);
String contig = null;
long start = Long.MAX_VALUE, stop = 0;
for(GenomeLoc loc: nextFilePointer.locations) {
contig = loc.getContig();
start = Math.min(loc.getStart(),start);
stop = Math.max(loc.getStop(),stop);
}
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(GenomeLocParser.createGenomeLoc(contig,start,stop));
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
}

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.JVMUtils;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.sam.SamFileHeaderMerger;
@ -69,14 +70,14 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
/**
* Gets the file pointers bounded by this bin, grouped by the reader of origination.
* @param bin The bin for which to load data.
* @param locus The loci for which to load data.
* @return A map of the file pointers bounding the bin.
*/
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(final Bin bin) {
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(GenomeLoc locus) {
Map<SAMFileReader2,List<Chunk>> filePointers = new HashMap<SAMFileReader2,List<Chunk>>();
for(SAMFileReader reader: headerMerger.getReaders()) {
SAMFileReader2 reader2 = (SAMFileReader2)reader;
filePointers.put(reader2,reader2.getFilePointersBounding(bin));
filePointers.put(reader2,reader2.getFilePointersBounding(locus.getContig(),(int)locus.getStart(),(int)locus.getStop()));
}
return filePointers;
}
@ -109,6 +110,35 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
return firstReader.getLevelForBin(bin);
}
/**
* Gets the first locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
public int getFirstLocusInBin(final Bin bin) {
if(headerMerger.getReaders().size() == 0)
throw new StingException("Unable to determine number of level for bin; no BAMs are present.");
if(!hasIndex())
throw new SAMException("Unable to determine number of level for bin; BAM file index is not present.");
SAMFileReader2 firstReader = (SAMFileReader2)headerMerger.getReaders().iterator().next();
return firstReader.getFirstLocusInBin(bin);
}
/**
* Gets the last locus that this bin can index into.
* @param bin The bin to test.
* @return The last position that the given bin can represent.
*/
public int getLastLocusInBin(final Bin bin) {
if(headerMerger.getReaders().size() == 0)
throw new StingException("Unable to determine number of level for bin; no BAMs are present.");
if(!hasIndex())
throw new SAMException("Unable to determine number of level for bin; BAM file index is not present.");
SAMFileReader2 firstReader = (SAMFileReader2)headerMerger.getReaders().iterator().next();
return firstReader.getLastLocusInBin(bin);
}
public StingSAMIterator seek(Shard shard) {
if(!(shard instanceof BAMFormatAwareShard))
throw new StingException("BlockDrivenSAMDataSource cannot operate on shards of type: " + shard.getClass());