diff --git a/java/src/net/sf/samtools/BAMFileIndex2.java b/java/src/net/sf/samtools/BAMFileIndex2.java index dac293602..6b93ace94 100644 --- a/java/src/net/sf/samtools/BAMFileIndex2.java +++ b/java/src/net/sf/samtools/BAMFileIndex2.java @@ -36,7 +36,17 @@ import java.util.*; */ public class BAMFileIndex2 extends BAMFileIndex { + /** + * Reports the total amount of genomic data that any bin can index. + */ + private static final int BIN_SPAN = 512*1024*1024; + + /** + * Reports the maximum number of bins in a BAM file index, based on the the pseudocode + * in section 1.2 of the BAM spec. + */ private static final int MAX_BINS = 37450; // =(8^6-1)/7+1 + private static final int BAM_LIDX_SHIFT = 14; /** @@ -87,6 +97,30 @@ public class BAMFileIndex2 extends BAMFileIndex throw new SAMException("Unable to find correct bin for bin number "+binNumber); } + /** + * Gets the first locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + protected int getFirstLocusInBin(final Bin bin) { + final int level = getLevelForBinNumber(bin.binNumber); + final int levelStart = LEVEL_STARTS[level]; + final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; + return (bin.binNumber - levelStart)*(BIN_SPAN/levelSize); + } + + /** + * Gets the last locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + protected int getLastLocusInBin(final Bin bin) { + final int level = getLevelForBinNumber(bin.binNumber); + final int levelStart = LEVEL_STARTS[level]; + final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; + return (bin.binNumber - levelStart + 1)*(BIN_SPAN/levelSize) - 1; + } + /** * Completely load the index into memory. * @param file File to load. @@ -162,7 +196,7 @@ public class BAMFileIndex2 extends BAMFileIndex long[] getFilePointersContaining(final int referenceIndex, final int startPos, final int endPos) { List bins = getBinsContaining(referenceIndex,startPos,endPos); // System.out.println("# Sequence target TID: " + referenceIndex); - if (bins.size() == 0) { + if (bins == null) { return null; } @@ -185,10 +219,6 @@ public class BAMFileIndex2 extends BAMFileIndex return convertToArray(chunkList); } - long[] getFilePointersBounding(final Bin bin) { - return convertToArray(binToChunks.get(bin)); - } - /** * Get a list of bins in the BAM file that may contain SAMRecords for the given range. * @param referenceIndex sequence of desired SAMRecords diff --git a/java/src/net/sf/samtools/BAMFileReader2.java b/java/src/net/sf/samtools/BAMFileReader2.java index 6d954fc11..1f8f9c386 100644 --- a/java/src/net/sf/samtools/BAMFileReader2.java +++ b/java/src/net/sf/samtools/BAMFileReader2.java @@ -194,8 +194,15 @@ class BAMFileReader2 return bins; } - public List getFilePointersBounding(Bin bin) { - return Chunk.toChunkList(getFileIndex().getFilePointersBounding(bin)); + public List getFilePointersBounding(final String sequence, final int start, final int end) { + final SAMFileHeader fileHeader = getFileHeader(); + long[] filePointers = null; + int referenceIndex = fileHeader.getSequenceIndex(sequence); + if (referenceIndex != -1) { + final BAMFileIndex2 fileIndex = getFileIndex(); + filePointers = fileIndex.getFilePointersContaining(referenceIndex,start,end); + } + return (filePointers != null) ? Chunk.toChunkList(filePointers) : Collections.emptyList(); } /** @@ -493,7 +500,7 @@ class BAMFileReader2 throws IOException { while (true) { // Advance to next file block if necessary - while (mCompressedInputStream.getFilePointer() > mFilePointerLimit) { + while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) { if (mFilePointers == null || mFilePointerIndex >= mFilePointers.length) { return null; diff --git a/java/src/net/sf/samtools/SAMFileReader2.java b/java/src/net/sf/samtools/SAMFileReader2.java index 7aa5a0c1a..823737f64 100644 --- a/java/src/net/sf/samtools/SAMFileReader2.java +++ b/java/src/net/sf/samtools/SAMFileReader2.java @@ -88,7 +88,7 @@ public class SAMFileReader2 extends SAMFileReader { * @return Number of levels in this index. */ public int getNumIndexLevels() { - BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this); + final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this); if(fileIndex == null) throw new SAMException("Unable to determine number of index levels; BAM file index is not present."); return fileIndex.getNumIndexLevels(); @@ -100,12 +100,36 @@ public class SAMFileReader2 extends SAMFileReader { * @return the level associated with the given bin number. */ public int getLevelForBin(final Bin bin) { - BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this); + final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this); if(fileIndex == null) throw new SAMException("Unable to determine number of index levels; BAM file index is not present."); return fileIndex.getLevelForBinNumber(bin.binNumber); } + /** + * Gets the first locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + public int getFirstLocusInBin(final Bin bin) { + final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this); + if(fileIndex == null) + throw new SAMException("Unable to determine number of index levels; BAM file index is not present."); + return fileIndex.getFirstLocusInBin(bin); + } + + /** + * Gets the last locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + public int getLastLocusInBin(final Bin bin) { + final BAMFileIndex2 fileIndex = (BAMFileIndex2)JVMUtils.getFieldValue(getField("mFileIndex"),this); + if(fileIndex == null) + throw new SAMException("Unable to determine number of index levels; BAM file index is not present."); + return fileIndex.getLastLocusInBin(bin); + } + /** * Iterate through the given chunks in the file. * @param chunks List of chunks for which to retrieve data. @@ -123,10 +147,10 @@ public class SAMFileReader2 extends SAMFileReader { return reader.getOverlappingBins(sequence,start,end); } - public List getFilePointersBounding(final Bin bin) { + public List getFilePointersBounding(final String sequence, final int start, final int end) { // TODO: Add sanity checks so that we're not doing this against an unsupported BAM file. BAMFileReader2 reader = (BAMFileReader2)JVMUtils.getFieldValue(getField("mReader"),this); - return reader.getFilePointersBounding(bin); + return reader.getFilePointersBounding(sequence,start,end); } private Field getField(String fieldName) { diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java index bcb94f2f9..695d86fdc 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.datasources.shards; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource; @@ -75,21 +76,29 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { locationToReference.get(location.getContig()).add(location); } - // Group the loci by bin, sorted in the order in which bins appear in the file. Only use the smallest bins in the set. for(String contig: locationToReference.keySet()) { + // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. SortedMap> bins = new TreeMap>(); for(GenomeLoc location: locationToReference.get(contig)) { List binsForLocation = blockDrivenDataSource.getOverlappingBins(location); for(Bin bin: binsForLocation) { if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) { + final int firstLoc = blockDrivenDataSource.getFirstLocusInBin(bin); + final int lastLoc = blockDrivenDataSource.getLastLocusInBin(bin); if(!bins.containsKey(bin)) bins.put(bin,new ArrayList()); - bins.get(bin).add(location); + bins.get(bin).add(GenomeLocParser.createGenomeLoc(location.getContig(), + Math.max(location.getStart(),firstLoc), + Math.min(location.getStop(),lastLoc))); } } } - for(SortedMap.Entry> entry: bins.entrySet()) + + // Add a record of the new bin structure. + for(SortedMap.Entry> entry: bins.entrySet()) { + Collections.sort(entry.getValue()); filePointers.add(new FilePointer(entry.getKey(),entry.getValue())); + } } filePointerIterator = filePointers.iterator(); @@ -111,7 +120,14 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { */ public IndexDelimitedLocusShard next() { FilePointer nextFilePointer = filePointerIterator.next(); - Map> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin); + String contig = null; + long start = Long.MAX_VALUE, stop = 0; + for(GenomeLoc loc: nextFilePointer.locations) { + contig = loc.getContig(); + start = Math.min(loc.getStart(),start); + stop = Math.max(loc.getStop(),stop); + } + Map> chunksBounding = blockDrivenDataSource.getFilePointersBounding(GenomeLocParser.createGenomeLoc(contig,start,stop)); return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java index 40ca60d48..71c66052b 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.JVMUtils; import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import net.sf.picard.sam.SamFileHeaderMerger; @@ -69,14 +70,14 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { /** * Gets the file pointers bounded by this bin, grouped by the reader of origination. - * @param bin The bin for which to load data. + * @param locus The loci for which to load data. * @return A map of the file pointers bounding the bin. */ - public Map> getFilePointersBounding(final Bin bin) { + public Map> getFilePointersBounding(GenomeLoc locus) { Map> filePointers = new HashMap>(); for(SAMFileReader reader: headerMerger.getReaders()) { SAMFileReader2 reader2 = (SAMFileReader2)reader; - filePointers.put(reader2,reader2.getFilePointersBounding(bin)); + filePointers.put(reader2,reader2.getFilePointersBounding(locus.getContig(),(int)locus.getStart(),(int)locus.getStop())); } return filePointers; } @@ -109,6 +110,35 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { return firstReader.getLevelForBin(bin); } + /** + * Gets the first locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + public int getFirstLocusInBin(final Bin bin) { + if(headerMerger.getReaders().size() == 0) + throw new StingException("Unable to determine number of level for bin; no BAMs are present."); + if(!hasIndex()) + throw new SAMException("Unable to determine number of level for bin; BAM file index is not present."); + SAMFileReader2 firstReader = (SAMFileReader2)headerMerger.getReaders().iterator().next(); + return firstReader.getFirstLocusInBin(bin); + } + + /** + * Gets the last locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + public int getLastLocusInBin(final Bin bin) { + if(headerMerger.getReaders().size() == 0) + throw new StingException("Unable to determine number of level for bin; no BAMs are present."); + if(!hasIndex()) + throw new SAMException("Unable to determine number of level for bin; BAM file index is not present."); + SAMFileReader2 firstReader = (SAMFileReader2)headerMerger.getReaders().iterator().next(); + return firstReader.getLastLocusInBin(bin); + } + + public StingSAMIterator seek(Shard shard) { if(!(shard instanceof BAMFormatAwareShard)) throw new StingException("BlockDrivenSAMDataSource cannot operate on shards of type: " + shard.getClass());