From e53432d54d78c89a3dfcc5add1d3a4d6aeb12114 Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 5 Feb 2010 02:48:02 +0000 Subject: [PATCH] Checkpoint for combining adjacent intervals into the same shard. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2782 348d0f76-0448-11de-a6fe-93d51630548a --- java/src/net/sf/samtools/BAMFileIndex2.java | 98 ++++++++++++++----- java/src/net/sf/samtools/BAMFileReader2.java | 26 ++--- java/src/net/sf/samtools/Bin.java | 34 ++++++- java/src/net/sf/samtools/SAMFileReader2.java | 45 +++++++-- .../sting/gatk/datasources/BAMFileStat.java | 6 +- .../IndexDelimitedLocusShardStrategy.java | 77 ++++++++++++--- .../BlockDrivenSAMDataSource.java | 29 +++++- 7 files changed, 250 insertions(+), 65 deletions(-) diff --git a/java/src/net/sf/samtools/BAMFileIndex2.java b/java/src/net/sf/samtools/BAMFileIndex2.java index 17b470b17..39d18df78 100644 --- a/java/src/net/sf/samtools/BAMFileIndex2.java +++ b/java/src/net/sf/samtools/BAMFileIndex2.java @@ -39,17 +39,48 @@ public class BAMFileIndex2 private static final int MAX_BINS = 37450; // =(8^6-1)/7+1 private static final int BAM_LIDX_SHIFT = 14; + /** + * What is the starting bin for each level? + */ + private static final int[] LEVEL_STARTS = {0,1,9,73,585,4681}; + /** * A mapping of reference sequence index to list of bins. */ protected final SortedMap referenceToBins = new TreeMap(); + /** + * A mapping of reference sequence index to linear indices. + */ protected final SortedMap referenceToLinearIndices = new TreeMap(); protected BAMFileIndex2(final File file) { loadIndex(file); } + /** + * Get the number of levels employed by this index. + * @return Number of levels in this index. + */ + protected int getNumIndexLevels() { + return LEVEL_STARTS.length; + } + + /** + * Gets the level associated with the given bin number. + * @param binNumber The bin number for which to determine the level. + * @return the level associated with the given bin number. + */ + protected int getLevelForBinNumber(final int binNumber) { + if(binNumber >= MAX_BINS) + throw new SAMException("Tried to get level for invalid bin."); + for(int i = getNumIndexLevels()-1; i >= 0; i--) { + if(binNumber >= LEVEL_STARTS[i]) + return i; + } + throw new SAMException("Unable to find correct bin for bin number "+binNumber); + } + /** * Completely load the index into memory. * @param file File to load. @@ -121,30 +152,16 @@ public class BAMFileIndex2 * positions. The last position in each pair is a virtual file pointer to the first SAMRecord beyond * the range that may contain the indicated SAMRecords. */ - long[] getSearchBins(final int referenceIndex, final int startPos, final int endPos) { - - // System.out.println("# Sequence count: " + sequenceCount); - if (referenceIndex >= referenceToBins.size()) { - return null; - } - - final BitSet regionBins = regionToBins(startPos, endPos); - if (regionBins == null) { - return null; - } - - Bin[] bins = referenceToBins.get(referenceIndex); - + long[] getFilePointersContaining(final int referenceIndex, final int startPos, final int endPos) { + List bins = getBinsContaining(referenceIndex,startPos,endPos); // System.out.println("# Sequence target TID: " + referenceIndex); - if (bins.length == 0) { + if (bins.size() == 0) { return null; } List chunkList = new ArrayList(); - for(Bin bin: bins) { - if (regionBins.get(bin.binNumber)) - chunkList.addAll(bin.chunks); - } + for(Bin bin: bins) + chunkList.addAll(bin.chunks); if (chunkList.isEmpty()) { return null; @@ -161,6 +178,39 @@ public class BAMFileIndex2 return convertToArray(chunkList); } + long[] getFilePointersBounding(final Bin bin) { + return convertToArray(bin.chunks); + } + + /** + * Get a list of bins in the BAM file that may contain SAMRecords for the given range. + * @param referenceIndex sequence of desired SAMRecords + * @param startPos 1-based start of the desired interval, inclusive + * @param endPos 1-based end of the desired interval, inclusive + * @return a list of bins that contain relevant data. + */ + List getBinsContaining(final int referenceIndex, final int startPos, final int endPos) { + List filteredBins = new ArrayList(); + + if (referenceIndex >= referenceToBins.size()) { + return null; + } + + final BitSet regionBins = regionToBins(startPos, endPos); + if (regionBins == null) { + return null; + } + + Bin[] bins = referenceToBins.get(referenceIndex); + + for(Bin bin: bins) { + if (regionBins.get(bin.binNumber)) + filteredBins.add(bin); + } + + return filteredBins; + } + /** * Use to get close to the unmapped reads at the end of a BAM file. * @return The file offset of the first record in the last linear bin, or -1 @@ -230,11 +280,11 @@ public class BAMFileIndex2 int k; final BitSet bitSet = new BitSet(MAX_BINS); bitSet.set(0); - for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k); - for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k); - for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k); - for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k); - for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k); + for (k = LEVEL_STARTS[1] + (start>>26); k <= LEVEL_STARTS[1] + (end>>26); ++k) bitSet.set(k); + for (k = LEVEL_STARTS[2] + (start>>23); k <= LEVEL_STARTS[2] + (end>>23); ++k) bitSet.set(k); + for (k = LEVEL_STARTS[3] + (start>>20); k <= LEVEL_STARTS[3] + (end>>20); ++k) bitSet.set(k); + for (k = LEVEL_STARTS[4] + (start>>17); k <= LEVEL_STARTS[4] + (end>>17); ++k) bitSet.set(k); + for (k = LEVEL_STARTS[5] + (start>>14); k <= LEVEL_STARTS[5] + (end>>14); ++k) bitSet.set(k); return bitSet; } diff --git a/java/src/net/sf/samtools/BAMFileReader2.java b/java/src/net/sf/samtools/BAMFileReader2.java index 51030e534..157e44645 100644 --- a/java/src/net/sf/samtools/BAMFileReader2.java +++ b/java/src/net/sf/samtools/BAMFileReader2.java @@ -50,7 +50,7 @@ class BAMFileReader2 private final BlockCompressedInputStream mCompressedInputStream; private SAMFileHeader mFileHeader = null; // Populated if the file is seekable and an index exists - private BAMFileIndex mFileIndex = null; + private BAMFileIndex2 mFileIndex = null; private long mFirstRecordPointer = 0; private CloseableIterator mCurrentIterator = null; // If true, all SAMRecords are fully decoded as they are read. @@ -115,11 +115,11 @@ class BAMFileReader2 /** * @return the file index, if one exists, else null. */ - BAMFileIndex getFileIndex() { + BAMFileIndex2 getFileIndex() { return mFileIndex; } - void setFileIndex(final BAMFileIndex fileIndex) { + void setFileIndex(final BAMFileIndex2 fileIndex) { mFileIndex = fileIndex; } @@ -184,17 +184,21 @@ class BAMFileReader2 return mCurrentIterator; } - public List getOverlappingFilePointers(final String sequence, final int start, final int end) { - long[] filePointers = null; - + public List getOverlappingBins(final String sequence, final int start, final int end) { + List bins = null; + final SAMFileHeader fileHeader = getFileHeader(); int referenceIndex = fileHeader.getSequenceIndex(sequence); if (referenceIndex != -1) { - final BAMFileIndex fileIndex = getFileIndex(); - filePointers = fileIndex.getSearchBins(referenceIndex, start, end); + final BAMFileIndex2 fileIndex = getFileIndex(); + bins = fileIndex.getBinsContaining(referenceIndex, start, end); } - return Chunk.toChunkList(filePointers); + return bins; + } + + public List getFilePointersBounding(Bin bin) { + return Chunk.toChunkList(getFileIndex().getFilePointersBounding(bin)); } /** @@ -463,8 +467,8 @@ class BAMFileReader2 final SAMFileHeader fileHeader = getFileHeader(); int referenceIndex = fileHeader.getSequenceIndex(sequence); if (referenceIndex != -1) { - final BAMFileIndex fileIndex = getFileIndex(); - filePointers = fileIndex.getSearchBins(referenceIndex, start, end); + final BAMFileIndex2 fileIndex = getFileIndex(); + filePointers = fileIndex.getFilePointersContaining(referenceIndex, start, end); } // Create an iterator over the above chunk boundaries. diff --git a/java/src/net/sf/samtools/Bin.java b/java/src/net/sf/samtools/Bin.java index 734685f32..55dbe6018 100644 --- a/java/src/net/sf/samtools/Bin.java +++ b/java/src/net/sf/samtools/Bin.java @@ -8,7 +8,7 @@ import java.util.List; * @author mhanna * @version 0.1 */ -public class Bin { +public class Bin implements Comparable { /** * The reference sequence associated with this bin. */ @@ -29,4 +29,36 @@ public class Bin { this.binNumber = binNumber; this.chunks = chunks; } + + /** + * See whether two bins are equal. If the ref seq and the bin number + * are equal, assume equality of the chunk list. + * @param other The other Bin to which to compare this. + * @return True if the two bins are equal. False otherwise. + */ + public boolean equals(Object other) { + if(other == null) return false; + if(!(other instanceof Bin)) return false; + + Bin otherBin = (Bin)other; + return this.referenceSequence == otherBin.referenceSequence && this.binNumber == otherBin.binNumber; + } + + /** + * Compare two bins to see what ordering they should appear in. + * @param other Other bin to which this bin should be compared. + * @return -1 if this < other, 0 if this == other, 1 if this > other. + */ + public int compareTo(Object other) { + if(other == null) + throw new ClassCastException("Cannot compare to a null object"); + Bin otherBin = (Bin)other; + + // Check the reference sequences first. + if(this.referenceSequence != otherBin.referenceSequence) + return ((Integer)referenceSequence).compareTo(otherBin.referenceSequence); + + // Then check the bin ordering. + return ((Integer)binNumber).compareTo(otherBin.binNumber); + } } diff --git a/java/src/net/sf/samtools/SAMFileReader2.java b/java/src/net/sf/samtools/SAMFileReader2.java index 06ecd083a..ab01449e8 100644 --- a/java/src/net/sf/samtools/SAMFileReader2.java +++ b/java/src/net/sf/samtools/SAMFileReader2.java @@ -58,7 +58,7 @@ public class SAMFileReader2 implements Iterable { } private boolean mIsBinary = false; - private BAMFileIndex mFileIndex = null; + private BAMFileIndex2 mFileIndex = null; private ReaderImplementation mReader = null; private File samFile = null; @@ -138,9 +138,6 @@ public class SAMFileReader2 implements Iterable { if (mReader != null) { mReader.close(); } - if (mFileIndex != null) { - mFileIndex.close(); - } mReader = null; mFileIndex = null; } @@ -159,6 +156,27 @@ public class SAMFileReader2 implements Iterable { return (mFileIndex != null); } + /** + * Get the number of levels employed by this index. + * @return Number of levels in this index. + */ + public int getNumIndexLevels() { + if(mFileIndex == null) + throw new SAMException("Unable to determine number of index levels; BAM file index is not present."); + return mFileIndex.getNumIndexLevels(); + } + + /** + * Gets the level associated with the given bin number. + * @param bin The bin for which to determine the level. + * @return the level associated with the given bin number. + */ + public int getLevelForBin(final Bin bin) { + if(mFileIndex == null) + throw new SAMException("Unable to determine level of bin number; BAM file index is not present."); + return mFileIndex.getLevelForBinNumber(bin.binNumber); + } + public SAMFileHeader getFileHeader() { return mReader.getFileHeader(); } @@ -190,19 +208,26 @@ public class SAMFileReader2 implements Iterable { * @return An iterator over the given chunks. */ public CloseableIterator iterator(List chunks) { - // TODO: Add sanity checks so that we're not doing this against a BAM file. + // TODO: Add sanity checks so that we're not doing this against an unsupported BAM file. if(!(mReader instanceof BAMFileReader2)) throw new PicardException("This call cannot be performed without a backing BAMFileReader2"); return ((BAMFileReader2)mReader).getIterator(chunks); } - public List getOverlappingFilePointers(final String sequence, final int start, final int end) { - // TODO: Add sanity checks so that we're not doing this against a BAM file. + public List getOverlappingBins(final String sequence, final int start, final int end) { + // TODO: Add sanity checks so that we're not doing this against an unsupported BAM file. if(!(mReader instanceof BAMFileReader2)) throw new PicardException("This call cannot be performed without a backing BAMFileReader2"); - return ((BAMFileReader2)mReader).getOverlappingFilePointers(sequence,start,end); + return ((BAMFileReader2)mReader).getOverlappingBins(sequence,start,end); } + public List getFilePointersBounding(final Bin bin) { + if(!(mReader instanceof BAMFileReader2)) + throw new PicardException("This call cannot be performed without a backing BAMFileReader2"); + return ((BAMFileReader2)mReader).getFilePointersBounding(bin); + } + + /** * Iterate over records that match the given interval. Only valid to call this if hasIndex() == true. *

@@ -382,7 +407,7 @@ public class SAMFileReader2 implements Iterable { final BAMFileReader2 reader = new BAMFileReader2(url, eagerDecode, validationStringency); mReader = reader; if (indexFile != null) { - mFileIndex = new BAMFileIndex(indexFile); + mFileIndex = new BAMFileIndex2(indexFile); reader.setFileIndex(mFileIndex); } } else { @@ -411,7 +436,7 @@ public class SAMFileReader2 implements Iterable { indexFile = findIndexFile(file); } if (indexFile != null) { - mFileIndex = new BAMFileIndex(indexFile); + mFileIndex = new BAMFileIndex2(indexFile); reader.setFileIndex(mFileIndex); if (indexFile.lastModified() < file.lastModified()) { System.err.println("WARNING: BAM index file " + indexFile.getAbsolutePath() + diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/BAMFileStat.java b/java/src/org/broadinstitute/sting/gatk/datasources/BAMFileStat.java index ebc1baf95..7882001e4 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/BAMFileStat.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/BAMFileStat.java @@ -25,7 +25,7 @@ public class BAMFileStat extends CommandLineProgram { private CommandType command; @Argument(doc="The BAM file to inspect.",required=true) - private File bamFile; + private String bamFileName; @Argument(doc="The range of blocks to inspect.",required=false) private String range; @@ -46,10 +46,10 @@ public class BAMFileStat extends CommandLineProgram { switch(command) { case ShowBlocks: - showBlocks(bamFile,startPosition,stopPosition); + showBlocks(new File(bamFileName),startPosition,stopPosition); break; case ShowIndex: - showIndexBins(bamFile); + showIndexBins(new File(bamFileName+".bai")); break; } return 0; diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java index cea179ae2..7466d3504 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSA import java.util.*; import net.sf.samtools.Chunk; +import net.sf.samtools.Bin; /* * Copyright (c) 2009 The Broad Institute @@ -40,9 +41,18 @@ import net.sf.samtools.Chunk; * A sharding strategy for loci based on reading of the index. */ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { + /** + * The data source to use when performing this sharding. + */ + private final BlockDrivenSAMDataSource blockDrivenDataSource; /** our storage of the genomic locations they'd like to shard over */ - private final SortedMap> locations = new TreeMap>(); + private final List filePointers = new ArrayList(); + + /** + * An iterator through the available file pointers. + */ + private final Iterator filePointerIterator; /** * construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs @@ -50,8 +60,39 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { * @param locations List of locations for which to load data. */ IndexDelimitedLocusShardStrategy(SAMDataSource dataSource, GenomeLocSortedSet locations) { - for(GenomeLoc location: locations) - this.locations.put(location,((BlockDrivenSAMDataSource)dataSource).getOverlappingFilePointers(location)); + if(!(dataSource instanceof BlockDrivenSAMDataSource)) + throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source."); + + blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource; + final int deepestBinLevel = blockDrivenDataSource.getNumIndexLevels()-1; + + // Create a list of contig name -> genome loc, sorted in INSERTION ORDER. + LinkedHashMap> locationToReference = new LinkedHashMap>(); + for(GenomeLoc location: locations) { + if(!locationToReference.containsKey(location.getContig())) + locationToReference.put(location.getContig(),new ArrayList()); + locationToReference.get(location.getContig()).add(location); + } + + // Group the loci by bin, sorted in the order in which bins appear in the file. Only use the smallest bins in the set. + // TODO -- does this work with large interval lists? + for(String contig: locationToReference.keySet()) { + SortedMap> bins = new TreeMap>(); + for(GenomeLoc location: locationToReference.get(contig)) { + List binsForLocation = blockDrivenDataSource.getOverlappingBins(location); + for(Bin bin: binsForLocation) { + if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) { + if(!bins.containsKey(bin)) + bins.put(bin,new ArrayList()); + bins.get(bin).add(location); + } + } + } + for(SortedMap.Entry> entry: bins.entrySet()) + filePointers.add(new FilePointer(entry.getKey(),entry.getValue())); + } + + filePointerIterator = filePointers.iterator(); } /** @@ -60,7 +101,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { * @return false if we're done processing shards */ public boolean hasNext() { - return ( !locations.isEmpty() ); + return filePointerIterator.hasNext(); } /** @@ -69,16 +110,9 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { * @return the next shard */ public IndexDelimitedLocusShard next() { - if (( this.locations == null ) || ( locations.isEmpty() )) { - throw new StingException("IntervalShardStrategy: genomic regions list is empty in next() function."); - } - - // get the first region in the list - GenomeLoc loc = locations.firstKey(); - List filePointers = locations.get(loc); - locations.remove(loc); - - return new IndexDelimitedLocusShard(Collections.singletonList(loc),filePointers,Shard.ShardType.LOCUS_INTERVAL); + FilePointer nextFilePointer = filePointerIterator.next(); + List chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin); + return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL); } /** we don't support the remove command */ @@ -94,4 +128,19 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { public Iterator iterator() { return this; } + + /** + * Represents a small section of a BAM file, and every associated interval. + */ + private class FilePointer { + private final Bin bin; + private final List locations; + + public FilePointer(Bin bin, List locations) { + this.bin = bin; + this.locations = locations; + } + + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java index ee75291e5..0edd0f04d 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -44,8 +44,33 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { return reader.hasIndex(); } - public List getOverlappingFilePointers(GenomeLoc location) { - return reader.getOverlappingFilePointers(location.getContig(),(int)location.getStart(),(int)location.getStop()); + public List getOverlappingBins(GenomeLoc location) { + return reader.getOverlappingBins(location.getContig(),(int)location.getStart(),(int)location.getStop()); + } + + public List getFilePointersBounding(final Bin bin) { + return reader.getFilePointersBounding(bin); + } + + /** + * Get the number of levels employed by this index. + * @return Number of levels in this index. + */ + public int getNumIndexLevels() { + if(!hasIndex()) + throw new SAMException("Unable to determine number of index levels; BAM file index is not present."); + return reader.getNumIndexLevels(); + } + + /** + * Gets the level associated with the given bin number. + * @param bin The bin for which to determine the level. + * @return the level associated with the given bin number. + */ + public int getLevelForBin(final Bin bin) { + if(!hasIndex()) + throw new SAMException("Unable to determine level of bin number; BAM file index is not present."); + return reader.getLevelForBin(bin); } public StingSAMIterator seek(Shard shard) {