Correcting my incomplete understanding of how the BAM file index actually works.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2833 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
5f74fffa02
commit
77af5822d4
|
|
@ -36,7 +36,17 @@ import java.util.*;
|
|||
*/
|
||||
public class BAMFileIndex2 extends BAMFileIndex
|
||||
{
|
||||
/**
|
||||
* Reports the total amount of genomic data that any bin can index.
|
||||
*/
|
||||
private static final int BIN_SPAN = 512*1024*1024;
|
||||
|
||||
/**
|
||||
* Reports the maximum number of bins in a BAM file index, based on the the pseudocode
|
||||
* in section 1.2 of the BAM spec.
|
||||
*/
|
||||
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
|
||||
|
||||
private static final int BAM_LIDX_SHIFT = 14;
|
||||
|
||||
/**
|
||||
|
|
@ -87,6 +97,30 @@ public class BAMFileIndex2 extends BAMFileIndex
|
|||
throw new SAMException("Unable to find correct bin for bin number "+binNumber);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the first locus that this bin can index into.
|
||||
* @param bin The bin to test.
|
||||
* @return The last position that the given bin can represent.
|
||||
*/
|
||||
protected int getFirstLocusInBin(final Bin bin) {
|
||||
final int level = getLevelForBinNumber(bin.binNumber);
|
||||
final int levelStart = LEVEL_STARTS[level];
|
||||
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
|
||||
return (bin.binNumber - levelStart)*(BIN_SPAN/levelSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the last locus that this bin can index into.
|
||||
* @param bin The bin to test.
|
||||
* @return The last position that the given bin can represent.
|
||||
*/
|
||||
protected int getLastLocusInBin(final Bin bin) {
|
||||
final int level = getLevelForBinNumber(bin.binNumber);
|
||||
final int levelStart = LEVEL_STARTS[level];
|
||||
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
|
||||
return (bin.binNumber - levelStart + 1)*(BIN_SPAN/levelSize) - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Completely load the index into memory.
|
||||
* @param file File to load.
|
||||
|
|
@ -162,7 +196,7 @@ public class BAMFileIndex2 extends BAMFileIndex
|
|||
long[] getFilePointersContaining(final int referenceIndex, final int startPos, final int endPos) {
|
||||
List<Bin> bins = getBinsContaining(referenceIndex,startPos,endPos);
|
||||
// System.out.println("# Sequence target TID: " + referenceIndex);
|
||||
if (bins.size() == 0) {
|
||||
if (bins == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
@ -185,10 +219,6 @@ public class BAMFileIndex2 extends BAMFileIndex
|
|||
return convertToArray(chunkList);
|
||||
}
|
||||
|
||||
long[] getFilePointersBounding(final Bin bin) {
|
||||
return convertToArray(binToChunks.get(bin));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a list of bins in the BAM file that may contain SAMRecords for the given range.
|
||||
* @param referenceIndex sequence of desired SAMRecords
|
||||
|
|
|
|||
|
|
@ -194,8 +194,15 @@ class BAMFileReader2
|
|||
return bins;
|
||||
}
|
||||
|
||||
public List<Chunk> getFilePointersBounding(Bin bin) {
|
||||
return Chunk.toChunkList(getFileIndex().getFilePointersBounding(bin));
|
||||
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
long[] filePointers = null;
|
||||
int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
if (referenceIndex != -1) {
|
||||
final BAMFileIndex2 fileIndex = getFileIndex();
|
||||
filePointers = fileIndex.getFilePointersContaining(referenceIndex,start,end);
|
||||
}
|
||||
return (filePointers != null) ? Chunk.toChunkList(filePointers) : Collections.<Chunk>emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -493,7 +500,7 @@ class BAMFileReader2
|
|||
throws IOException {
|
||||
while (true) {
|
||||
// Advance to next file block if necessary
|
||||
while (mCompressedInputStream.getFilePointer() > mFilePointerLimit) {
|
||||
while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) {
|
||||
if (mFilePointers == null ||
|
||||
mFilePointerIndex >= mFilePointers.length) {
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -88,7 +88,7 @@ public class SAMFileReader2 extends SAMFileReader {
|
|||
* @return Number of levels in this index.
|
||||
*/
|
||||
public int getNumIndexLevels() {
|
||||
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||
if(fileIndex == null)
|
||||
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||
return fileIndex.getNumIndexLevels();
|
||||
|
|
@ -100,12 +100,36 @@ public class SAMFileReader2 extends SAMFileReader {
|
|||
* @return the level associated with the given bin number.
|
||||
*/
|
||||
public int getLevelForBin(final Bin bin) {
|
||||
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||
if(fileIndex == null)
|
||||
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||
return fileIndex.getLevelForBinNumber(bin.binNumber);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the first locus that this bin can index into.
|
||||
* @param bin The bin to test.
|
||||
* @return The last position that the given bin can represent.
|
||||
*/
|
||||
public int getFirstLocusInBin(final Bin bin) {
|
||||
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||
if(fileIndex == null)
|
||||
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||
return fileIndex.getFirstLocusInBin(bin);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the last locus that this bin can index into.
|
||||
* @param bin The bin to test.
|
||||
* @return The last position that the given bin can represent.
|
||||
*/
|
||||
public int getLastLocusInBin(final Bin bin) {
|
||||
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||
if(fileIndex == null)
|
||||
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||
return fileIndex.getLastLocusInBin(bin);
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate through the given chunks in the file.
|
||||
* @param chunks List of chunks for which to retrieve data.
|
||||
|
|
@ -123,10 +147,10 @@ public class SAMFileReader2 extends SAMFileReader {
|
|||
return reader.getOverlappingBins(sequence,start,end);
|
||||
}
|
||||
|
||||
public List<Chunk> getFilePointersBounding(final Bin bin) {
|
||||
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
|
||||
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
||||
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
|
||||
return reader.getFilePointersBounding(bin);
|
||||
return reader.getFilePointersBounding(sequence,start,end);
|
||||
}
|
||||
|
||||
private Field getField(String fieldName) {
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.datasources.shards;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
|
||||
|
||||
|
|
@ -75,21 +76,29 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
|||
locationToReference.get(location.getContig()).add(location);
|
||||
}
|
||||
|
||||
// Group the loci by bin, sorted in the order in which bins appear in the file. Only use the smallest bins in the set.
|
||||
for(String contig: locationToReference.keySet()) {
|
||||
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
||||
SortedMap<Bin,List<GenomeLoc>> bins = new TreeMap<Bin,List<GenomeLoc>>();
|
||||
for(GenomeLoc location: locationToReference.get(contig)) {
|
||||
List<Bin> binsForLocation = blockDrivenDataSource.getOverlappingBins(location);
|
||||
for(Bin bin: binsForLocation) {
|
||||
if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) {
|
||||
final int firstLoc = blockDrivenDataSource.getFirstLocusInBin(bin);
|
||||
final int lastLoc = blockDrivenDataSource.getLastLocusInBin(bin);
|
||||
if(!bins.containsKey(bin))
|
||||
bins.put(bin,new ArrayList<GenomeLoc>());
|
||||
bins.get(bin).add(location);
|
||||
bins.get(bin).add(GenomeLocParser.createGenomeLoc(location.getContig(),
|
||||
Math.max(location.getStart(),firstLoc),
|
||||
Math.min(location.getStop(),lastLoc)));
|
||||
}
|
||||
}
|
||||
}
|
||||
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet())
|
||||
|
||||
// Add a record of the new bin structure.
|
||||
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet()) {
|
||||
Collections.sort(entry.getValue());
|
||||
filePointers.add(new FilePointer(entry.getKey(),entry.getValue()));
|
||||
}
|
||||
}
|
||||
|
||||
filePointerIterator = filePointers.iterator();
|
||||
|
|
@ -111,7 +120,14 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
|||
*/
|
||||
public IndexDelimitedLocusShard next() {
|
||||
FilePointer nextFilePointer = filePointerIterator.next();
|
||||
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin);
|
||||
String contig = null;
|
||||
long start = Long.MAX_VALUE, stop = 0;
|
||||
for(GenomeLoc loc: nextFilePointer.locations) {
|
||||
contig = loc.getContig();
|
||||
start = Math.min(loc.getStart(),start);
|
||||
stop = Math.max(loc.getStop(),stop);
|
||||
}
|
||||
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(GenomeLocParser.createGenomeLoc(contig,start,stop));
|
||||
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
|||
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.JVMUtils;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
|
|
@ -69,14 +70,14 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
|||
|
||||
/**
|
||||
* Gets the file pointers bounded by this bin, grouped by the reader of origination.
|
||||
* @param bin The bin for which to load data.
|
||||
* @param locus The loci for which to load data.
|
||||
* @return A map of the file pointers bounding the bin.
|
||||
*/
|
||||
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(final Bin bin) {
|
||||
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(GenomeLoc locus) {
|
||||
Map<SAMFileReader2,List<Chunk>> filePointers = new HashMap<SAMFileReader2,List<Chunk>>();
|
||||
for(SAMFileReader reader: headerMerger.getReaders()) {
|
||||
SAMFileReader2 reader2 = (SAMFileReader2)reader;
|
||||
filePointers.put(reader2,reader2.getFilePointersBounding(bin));
|
||||
filePointers.put(reader2,reader2.getFilePointersBounding(locus.getContig(),(int)locus.getStart(),(int)locus.getStop()));
|
||||
}
|
||||
return filePointers;
|
||||
}
|
||||
|
|
@ -109,6 +110,35 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
|||
return firstReader.getLevelForBin(bin);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the first locus that this bin can index into.
|
||||
* @param bin The bin to test.
|
||||
* @return The last position that the given bin can represent.
|
||||
*/
|
||||
public int getFirstLocusInBin(final Bin bin) {
|
||||
if(headerMerger.getReaders().size() == 0)
|
||||
throw new StingException("Unable to determine number of level for bin; no BAMs are present.");
|
||||
if(!hasIndex())
|
||||
throw new SAMException("Unable to determine number of level for bin; BAM file index is not present.");
|
||||
SAMFileReader2 firstReader = (SAMFileReader2)headerMerger.getReaders().iterator().next();
|
||||
return firstReader.getFirstLocusInBin(bin);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the last locus that this bin can index into.
|
||||
* @param bin The bin to test.
|
||||
* @return The last position that the given bin can represent.
|
||||
*/
|
||||
public int getLastLocusInBin(final Bin bin) {
|
||||
if(headerMerger.getReaders().size() == 0)
|
||||
throw new StingException("Unable to determine number of level for bin; no BAMs are present.");
|
||||
if(!hasIndex())
|
||||
throw new SAMException("Unable to determine number of level for bin; BAM file index is not present.");
|
||||
SAMFileReader2 firstReader = (SAMFileReader2)headerMerger.getReaders().iterator().next();
|
||||
return firstReader.getLastLocusInBin(bin);
|
||||
}
|
||||
|
||||
|
||||
public StingSAMIterator seek(Shard shard) {
|
||||
if(!(shard instanceof BAMFormatAwareShard))
|
||||
throw new StingException("BlockDrivenSAMDataSource cannot operate on shards of type: " + shard.getClass());
|
||||
|
|
|
|||
Loading…
Reference in New Issue