Correcting my incomplete understanding of how the BAM file index actually works.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2833 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
5f74fffa02
commit
77af5822d4
|
|
@ -36,7 +36,17 @@ import java.util.*;
|
||||||
*/
|
*/
|
||||||
public class BAMFileIndex2 extends BAMFileIndex
|
public class BAMFileIndex2 extends BAMFileIndex
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* Reports the total amount of genomic data that any bin can index.
|
||||||
|
*/
|
||||||
|
private static final int BIN_SPAN = 512*1024*1024;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reports the maximum number of bins in a BAM file index, based on the pseudocode
|
||||||
|
* in section 1.2 of the BAM spec.
|
||||||
|
*/
|
||||||
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
|
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
|
||||||
|
|
||||||
private static final int BAM_LIDX_SHIFT = 14;
|
private static final int BAM_LIDX_SHIFT = 14;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -87,6 +97,30 @@ public class BAMFileIndex2 extends BAMFileIndex
|
||||||
throw new SAMException("Unable to find correct bin for bin number "+binNumber);
|
throw new SAMException("Unable to find correct bin for bin number "+binNumber);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the first locus that this bin can index into.
|
||||||
|
* @param bin The bin to test.
|
||||||
|
* @return The first position that the given bin can represent.
|
||||||
|
*/
|
||||||
|
protected int getFirstLocusInBin(final Bin bin) {
|
||||||
|
final int level = getLevelForBinNumber(bin.binNumber);
|
||||||
|
final int levelStart = LEVEL_STARTS[level];
|
||||||
|
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
|
||||||
|
return (bin.binNumber - levelStart)*(BIN_SPAN/levelSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the last locus that this bin can index into.
|
||||||
|
* @param bin The bin to test.
|
||||||
|
* @return The last position that the given bin can represent.
|
||||||
|
*/
|
||||||
|
protected int getLastLocusInBin(final Bin bin) {
|
||||||
|
final int level = getLevelForBinNumber(bin.binNumber);
|
||||||
|
final int levelStart = LEVEL_STARTS[level];
|
||||||
|
final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart;
|
||||||
|
return (bin.binNumber - levelStart + 1)*(BIN_SPAN/levelSize) - 1;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Completely load the index into memory.
|
* Completely load the index into memory.
|
||||||
* @param file File to load.
|
* @param file File to load.
|
||||||
|
|
@ -162,7 +196,7 @@ public class BAMFileIndex2 extends BAMFileIndex
|
||||||
long[] getFilePointersContaining(final int referenceIndex, final int startPos, final int endPos) {
|
long[] getFilePointersContaining(final int referenceIndex, final int startPos, final int endPos) {
|
||||||
List<Bin> bins = getBinsContaining(referenceIndex,startPos,endPos);
|
List<Bin> bins = getBinsContaining(referenceIndex,startPos,endPos);
|
||||||
// System.out.println("# Sequence target TID: " + referenceIndex);
|
// System.out.println("# Sequence target TID: " + referenceIndex);
|
||||||
if (bins.size() == 0) {
|
if (bins == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -185,10 +219,6 @@ public class BAMFileIndex2 extends BAMFileIndex
|
||||||
return convertToArray(chunkList);
|
return convertToArray(chunkList);
|
||||||
}
|
}
|
||||||
|
|
||||||
long[] getFilePointersBounding(final Bin bin) {
|
|
||||||
return convertToArray(binToChunks.get(bin));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get a list of bins in the BAM file that may contain SAMRecords for the given range.
|
* Get a list of bins in the BAM file that may contain SAMRecords for the given range.
|
||||||
* @param referenceIndex sequence of desired SAMRecords
|
* @param referenceIndex sequence of desired SAMRecords
|
||||||
|
|
|
||||||
|
|
@ -194,8 +194,15 @@ class BAMFileReader2
|
||||||
return bins;
|
return bins;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Chunk> getFilePointersBounding(Bin bin) {
|
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
|
||||||
return Chunk.toChunkList(getFileIndex().getFilePointersBounding(bin));
|
final SAMFileHeader fileHeader = getFileHeader();
|
||||||
|
long[] filePointers = null;
|
||||||
|
int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||||
|
if (referenceIndex != -1) {
|
||||||
|
final BAMFileIndex2 fileIndex = getFileIndex();
|
||||||
|
filePointers = fileIndex.getFilePointersContaining(referenceIndex,start,end);
|
||||||
|
}
|
||||||
|
return (filePointers != null) ? Chunk.toChunkList(filePointers) : Collections.<Chunk>emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -493,7 +500,7 @@ class BAMFileReader2
|
||||||
throws IOException {
|
throws IOException {
|
||||||
while (true) {
|
while (true) {
|
||||||
// Advance to next file block if necessary
|
// Advance to next file block if necessary
|
||||||
while (mCompressedInputStream.getFilePointer() > mFilePointerLimit) {
|
while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) {
|
||||||
if (mFilePointers == null ||
|
if (mFilePointers == null ||
|
||||||
mFilePointerIndex >= mFilePointers.length) {
|
mFilePointerIndex >= mFilePointers.length) {
|
||||||
return null;
|
return null;
|
||||||
|
|
|
||||||
|
|
@ -88,7 +88,7 @@ public class SAMFileReader2 extends SAMFileReader {
|
||||||
* @return Number of levels in this index.
|
* @return Number of levels in this index.
|
||||||
*/
|
*/
|
||||||
public int getNumIndexLevels() {
|
public int getNumIndexLevels() {
|
||||||
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||||
if(fileIndex == null)
|
if(fileIndex == null)
|
||||||
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||||
return fileIndex.getNumIndexLevels();
|
return fileIndex.getNumIndexLevels();
|
||||||
|
|
@ -100,12 +100,36 @@ public class SAMFileReader2 extends SAMFileReader {
|
||||||
* @return the level associated with the given bin number.
|
* @return the level associated with the given bin number.
|
||||||
*/
|
*/
|
||||||
public int getLevelForBin(final Bin bin) {
|
public int getLevelForBin(final Bin bin) {
|
||||||
BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||||
if(fileIndex == null)
|
if(fileIndex == null)
|
||||||
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||||
return fileIndex.getLevelForBinNumber(bin.binNumber);
|
return fileIndex.getLevelForBinNumber(bin.binNumber);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the first locus that this bin can index into.
|
||||||
|
* @param bin The bin to test.
|
||||||
|
* @return The first position that the given bin can represent.
|
||||||
|
*/
|
||||||
|
public int getFirstLocusInBin(final Bin bin) {
|
||||||
|
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||||
|
if(fileIndex == null)
|
||||||
|
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||||
|
return fileIndex.getFirstLocusInBin(bin);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the last locus that this bin can index into.
|
||||||
|
* @param bin The bin to test.
|
||||||
|
* @return The last position that the given bin can represent.
|
||||||
|
*/
|
||||||
|
public int getLastLocusInBin(final Bin bin) {
|
||||||
|
final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this);
|
||||||
|
if(fileIndex == null)
|
||||||
|
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||||
|
return fileIndex.getLastLocusInBin(bin);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Iterate through the given chunks in the file.
|
* Iterate through the given chunks in the file.
|
||||||
* @param chunks List of chunks for which to retrieve data.
|
* @param chunks List of chunks for which to retrieve data.
|
||||||
|
|
@ -123,10 +147,10 @@ public class SAMFileReader2 extends SAMFileReader {
|
||||||
return reader.getOverlappingBins(sequence,start,end);
|
return reader.getOverlappingBins(sequence,start,end);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Chunk> getFilePointersBounding(final Bin bin) {
|
public List<Chunk> getFilePointersBounding(final String sequence, final int start, final int end) {
|
||||||
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
||||||
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
|
BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this);
|
||||||
return reader.getFilePointersBounding(bin);
|
return reader.getFilePointersBounding(sequence,start,end);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Field getField(String fieldName) {
|
private Field getField(String fieldName) {
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.datasources.shards;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
|
||||||
|
|
||||||
|
|
@ -75,21 +76,29 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
locationToReference.get(location.getContig()).add(location);
|
locationToReference.get(location.getContig()).add(location);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Group the loci by bin, sorted in the order in which bins appear in the file. Only use the smallest bins in the set.
|
|
||||||
for(String contig: locationToReference.keySet()) {
|
for(String contig: locationToReference.keySet()) {
|
||||||
|
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
||||||
SortedMap<Bin,List<GenomeLoc>> bins = new TreeMap<Bin,List<GenomeLoc>>();
|
SortedMap<Bin,List<GenomeLoc>> bins = new TreeMap<Bin,List<GenomeLoc>>();
|
||||||
for(GenomeLoc location: locationToReference.get(contig)) {
|
for(GenomeLoc location: locationToReference.get(contig)) {
|
||||||
List<Bin> binsForLocation = blockDrivenDataSource.getOverlappingBins(location);
|
List<Bin> binsForLocation = blockDrivenDataSource.getOverlappingBins(location);
|
||||||
for(Bin bin: binsForLocation) {
|
for(Bin bin: binsForLocation) {
|
||||||
if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) {
|
if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) {
|
||||||
|
final int firstLoc = blockDrivenDataSource.getFirstLocusInBin(bin);
|
||||||
|
final int lastLoc = blockDrivenDataSource.getLastLocusInBin(bin);
|
||||||
if(!bins.containsKey(bin))
|
if(!bins.containsKey(bin))
|
||||||
bins.put(bin,new ArrayList<GenomeLoc>());
|
bins.put(bin,new ArrayList<GenomeLoc>());
|
||||||
bins.get(bin).add(location);
|
bins.get(bin).add(GenomeLocParser.createGenomeLoc(location.getContig(),
|
||||||
|
Math.max(location.getStart(),firstLoc),
|
||||||
|
Math.min(location.getStop(),lastLoc)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet())
|
|
||||||
|
// Add a record of the new bin structure.
|
||||||
|
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet()) {
|
||||||
|
Collections.sort(entry.getValue());
|
||||||
filePointers.add(new FilePointer(entry.getKey(),entry.getValue()));
|
filePointers.add(new FilePointer(entry.getKey(),entry.getValue()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
filePointerIterator = filePointers.iterator();
|
filePointerIterator = filePointers.iterator();
|
||||||
|
|
@ -111,7 +120,14 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
*/
|
*/
|
||||||
public IndexDelimitedLocusShard next() {
|
public IndexDelimitedLocusShard next() {
|
||||||
FilePointer nextFilePointer = filePointerIterator.next();
|
FilePointer nextFilePointer = filePointerIterator.next();
|
||||||
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin);
|
String contig = null;
|
||||||
|
long start = Long.MAX_VALUE, stop = 0;
|
||||||
|
for(GenomeLoc loc: nextFilePointer.locations) {
|
||||||
|
contig = loc.getContig();
|
||||||
|
start = Math.min(loc.getStart(),start);
|
||||||
|
stop = Math.max(loc.getStop(),stop);
|
||||||
|
}
|
||||||
|
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(GenomeLocParser.createGenomeLoc(contig,start,stop));
|
||||||
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
|
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.JVMUtils;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import net.sf.samtools.util.CloseableIterator;
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||||
|
|
@ -69,14 +70,14 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the file pointers bounded by this bin, grouped by the reader of origination.
|
* Gets the file pointers bounded by this bin, grouped by the reader of origination.
|
||||||
* @param bin The bin for which to load data.
|
* @param locus The locus for which to load data.
|
||||||
* @return A map of the file pointers bounding the bin.
|
* @return A map of the file pointers bounding the bin.
|
||||||
*/
|
*/
|
||||||
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(final Bin bin) {
|
public Map<SAMFileReader2,List<Chunk>> getFilePointersBounding(GenomeLoc locus) {
|
||||||
Map<SAMFileReader2,List<Chunk>> filePointers = new HashMap<SAMFileReader2,List<Chunk>>();
|
Map<SAMFileReader2,List<Chunk>> filePointers = new HashMap<SAMFileReader2,List<Chunk>>();
|
||||||
for(SAMFileReader reader: headerMerger.getReaders()) {
|
for(SAMFileReader reader: headerMerger.getReaders()) {
|
||||||
SAMFileReader2 reader2 = (SAMFileReader2)reader;
|
SAMFileReader2 reader2 = (SAMFileReader2)reader;
|
||||||
filePointers.put(reader2,reader2.getFilePointersBounding(bin));
|
filePointers.put(reader2,reader2.getFilePointersBounding(locus.getContig(),(int)locus.getStart(),(int)locus.getStop()));
|
||||||
}
|
}
|
||||||
return filePointers;
|
return filePointers;
|
||||||
}
|
}
|
||||||
|
|
@ -109,6 +110,35 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
||||||
return firstReader.getLevelForBin(bin);
|
return firstReader.getLevelForBin(bin);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the first locus that this bin can index into.
|
||||||
|
* @param bin The bin to test.
|
||||||
|
* @return The first position that the given bin can represent.
|
||||||
|
*/
|
||||||
|
public int getFirstLocusInBin(final Bin bin) {
|
||||||
|
if(headerMerger.getReaders().size() == 0)
|
||||||
|
throw new StingException("Unable to determine number of level for bin; no BAMs are present.");
|
||||||
|
if(!hasIndex())
|
||||||
|
throw new SAMException("Unable to determine number of level for bin; BAM file index is not present.");
|
||||||
|
SAMFileReader2 firstReader = (SAMFileReader2)headerMerger.getReaders().iterator().next();
|
||||||
|
return firstReader.getFirstLocusInBin(bin);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the last locus that this bin can index into.
|
||||||
|
* @param bin The bin to test.
|
||||||
|
* @return The last position that the given bin can represent.
|
||||||
|
*/
|
||||||
|
public int getLastLocusInBin(final Bin bin) {
|
||||||
|
if(headerMerger.getReaders().size() == 0)
|
||||||
|
throw new StingException("Unable to determine number of level for bin; no BAMs are present.");
|
||||||
|
if(!hasIndex())
|
||||||
|
throw new SAMException("Unable to determine number of level for bin; BAM file index is not present.");
|
||||||
|
SAMFileReader2 firstReader = (SAMFileReader2)headerMerger.getReaders().iterator().next();
|
||||||
|
return firstReader.getLastLocusInBin(bin);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public StingSAMIterator seek(Shard shard) {
|
public StingSAMIterator seek(Shard shard) {
|
||||||
if(!(shard instanceof BAMFormatAwareShard))
|
if(!(shard instanceof BAMFormatAwareShard))
|
||||||
throw new StingException("BlockDrivenSAMDataSource cannot operate on shards of type: " + shard.getClass());
|
throw new StingException("BlockDrivenSAMDataSource cannot operate on shards of type: " + shard.getClass());
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue