Checkpoint for combining adjacent intervals into the same shard.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2782 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-02-05 02:48:02 +00:00
parent 0d347d662a
commit e53432d54d
7 changed files with 250 additions and 65 deletions

View File

@ -39,17 +39,48 @@ public class BAMFileIndex2
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
private static final int BAM_LIDX_SHIFT = 14;
/**
* What is the starting bin for each level?
*/
private static final int[] LEVEL_STARTS = {0,1,9,73,585,4681};
/**
* A mapping of reference sequence index to list of bins.
*/
protected final SortedMap<Integer,Bin[]> referenceToBins = new TreeMap<Integer,Bin[]>();
/**
* A mapping of reference sequence index to linear indices.
*/
protected final SortedMap<Integer,LinearIndex> referenceToLinearIndices = new TreeMap<Integer,LinearIndex>();
protected BAMFileIndex2(final File file) {
loadIndex(file);
}
/**
* Get the number of levels employed by this index.
* @return Number of levels in this index.
*/
protected int getNumIndexLevels() {
return LEVEL_STARTS.length;
}
/**
* Gets the level associated with the given bin number.
* @param binNumber The bin number for which to determine the level.
* @return the level associated with the given bin number.
*/
protected int getLevelForBinNumber(final int binNumber) {
if(binNumber >= MAX_BINS)
throw new SAMException("Tried to get level for invalid bin.");
for(int i = getNumIndexLevels()-1; i >= 0; i--) {
if(binNumber >= LEVEL_STARTS[i])
return i;
}
throw new SAMException("Unable to find correct bin for bin number "+binNumber);
}
/**
* Completely load the index into memory.
* @param file File to load.
@ -121,30 +152,16 @@ public class BAMFileIndex2
* positions. The last position in each pair is a virtual file pointer to the first SAMRecord beyond
* the range that may contain the indicated SAMRecords.
*/
long[] getSearchBins(final int referenceIndex, final int startPos, final int endPos) {
// System.out.println("# Sequence count: " + sequenceCount);
if (referenceIndex >= referenceToBins.size()) {
return null;
}
final BitSet regionBins = regionToBins(startPos, endPos);
if (regionBins == null) {
return null;
}
Bin[] bins = referenceToBins.get(referenceIndex);
long[] getFilePointersContaining(final int referenceIndex, final int startPos, final int endPos) {
List<Bin> bins = getBinsContaining(referenceIndex,startPos,endPos);
// System.out.println("# Sequence target TID: " + referenceIndex);
if (bins.length == 0) {
if (bins.size() == 0) {
return null;
}
List<Chunk> chunkList = new ArrayList<Chunk>();
for(Bin bin: bins) {
if (regionBins.get(bin.binNumber))
chunkList.addAll(bin.chunks);
}
for(Bin bin: bins)
chunkList.addAll(bin.chunks);
if (chunkList.isEmpty()) {
return null;
@ -161,6 +178,39 @@ public class BAMFileIndex2
return convertToArray(chunkList);
}
long[] getFilePointersBounding(final Bin bin) {
return convertToArray(bin.chunks);
}
/**
* Get a list of bins in the BAM file that may contain SAMRecords for the given range.
* @param referenceIndex sequence of desired SAMRecords
* @param startPos 1-based start of the desired interval, inclusive
* @param endPos 1-based end of the desired interval, inclusive
* @return a list of bins that contain relevant data.
*/
List<Bin> getBinsContaining(final int referenceIndex, final int startPos, final int endPos) {
List<Bin> filteredBins = new ArrayList<Bin>();
if (referenceIndex >= referenceToBins.size()) {
return null;
}
final BitSet regionBins = regionToBins(startPos, endPos);
if (regionBins == null) {
return null;
}
Bin[] bins = referenceToBins.get(referenceIndex);
for(Bin bin: bins) {
if (regionBins.get(bin.binNumber))
filteredBins.add(bin);
}
return filteredBins;
}
/**
* Use to get close to the unmapped reads at the end of a BAM file.
* @return The file offset of the first record in the last linear bin, or -1
@ -230,11 +280,11 @@ public class BAMFileIndex2
int k;
final BitSet bitSet = new BitSet(MAX_BINS);
bitSet.set(0);
for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k);
for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k);
for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k);
for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k);
for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k);
for (k = LEVEL_STARTS[1] + (start>>26); k <= LEVEL_STARTS[1] + (end>>26); ++k) bitSet.set(k);
for (k = LEVEL_STARTS[2] + (start>>23); k <= LEVEL_STARTS[2] + (end>>23); ++k) bitSet.set(k);
for (k = LEVEL_STARTS[3] + (start>>20); k <= LEVEL_STARTS[3] + (end>>20); ++k) bitSet.set(k);
for (k = LEVEL_STARTS[4] + (start>>17); k <= LEVEL_STARTS[4] + (end>>17); ++k) bitSet.set(k);
for (k = LEVEL_STARTS[5] + (start>>14); k <= LEVEL_STARTS[5] + (end>>14); ++k) bitSet.set(k);
return bitSet;
}

View File

@ -50,7 +50,7 @@ class BAMFileReader2
private final BlockCompressedInputStream mCompressedInputStream;
private SAMFileHeader mFileHeader = null;
// Populated if the file is seekable and an index exists
private BAMFileIndex mFileIndex = null;
private BAMFileIndex2 mFileIndex = null;
private long mFirstRecordPointer = 0;
private CloseableIterator<SAMRecord> mCurrentIterator = null;
// If true, all SAMRecords are fully decoded as they are read.
@ -115,11 +115,11 @@ class BAMFileReader2
/**
* @return the file index, if one exists, else null.
*/
BAMFileIndex getFileIndex() {
BAMFileIndex2 getFileIndex() {
return mFileIndex;
}
void setFileIndex(final BAMFileIndex fileIndex) {
void setFileIndex(final BAMFileIndex2 fileIndex) {
mFileIndex = fileIndex;
}
@ -184,17 +184,21 @@ class BAMFileReader2
return mCurrentIterator;
}
public List<Chunk> getOverlappingFilePointers(final String sequence, final int start, final int end) {
long[] filePointers = null;
public List<Bin> getOverlappingBins(final String sequence, final int start, final int end) {
List<Bin> bins = null;
final SAMFileHeader fileHeader = getFileHeader();
int referenceIndex = fileHeader.getSequenceIndex(sequence);
if (referenceIndex != -1) {
final BAMFileIndex fileIndex = getFileIndex();
filePointers = fileIndex.getSearchBins(referenceIndex, start, end);
final BAMFileIndex2 fileIndex = getFileIndex();
bins = fileIndex.getBinsContaining(referenceIndex, start, end);
}
return Chunk.toChunkList(filePointers);
return bins;
}
public List<Chunk> getFilePointersBounding(Bin bin) {
return Chunk.toChunkList(getFileIndex().getFilePointersBounding(bin));
}
/**
@ -463,8 +467,8 @@ class BAMFileReader2
final SAMFileHeader fileHeader = getFileHeader();
int referenceIndex = fileHeader.getSequenceIndex(sequence);
if (referenceIndex != -1) {
final BAMFileIndex fileIndex = getFileIndex();
filePointers = fileIndex.getSearchBins(referenceIndex, start, end);
final BAMFileIndex2 fileIndex = getFileIndex();
filePointers = fileIndex.getFilePointersContaining(referenceIndex, start, end);
}
// Create an iterator over the above chunk boundaries.

View File

@ -8,7 +8,7 @@ import java.util.List;
* @author mhanna
* @version 0.1
*/
public class Bin {
public class Bin implements Comparable {
/**
* The reference sequence associated with this bin.
*/
@ -29,4 +29,36 @@ public class Bin {
this.binNumber = binNumber;
this.chunks = chunks;
}
/**
* See whether two bins are equal. If the ref seq and the bin number
* are equal, assume equality of the chunk list.
* @param other The other Bin to which to compare this.
* @return True if the two bins are equal. False otherwise.
*/
public boolean equals(Object other) {
if(other == null) return false;
if(!(other instanceof Bin)) return false;
Bin otherBin = (Bin)other;
return this.referenceSequence == otherBin.referenceSequence && this.binNumber == otherBin.binNumber;
}
/**
* Compare two bins to see what ordering they should appear in.
* @param other Other bin to which this bin should be compared.
* @return -1 if this < other, 0 if this == other, 1 if this > other.
*/
public int compareTo(Object other) {
if(other == null)
throw new ClassCastException("Cannot compare to a null object");
Bin otherBin = (Bin)other;
// Check the reference sequences first.
if(this.referenceSequence != otherBin.referenceSequence)
return ((Integer)referenceSequence).compareTo(otherBin.referenceSequence);
// Then check the bin ordering.
return ((Integer)binNumber).compareTo(otherBin.binNumber);
}
}

View File

@ -58,7 +58,7 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
}
private boolean mIsBinary = false;
private BAMFileIndex mFileIndex = null;
private BAMFileIndex2 mFileIndex = null;
private ReaderImplementation mReader = null;
private File samFile = null;
@ -138,9 +138,6 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
if (mReader != null) {
mReader.close();
}
if (mFileIndex != null) {
mFileIndex.close();
}
mReader = null;
mFileIndex = null;
}
@ -159,6 +156,27 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
return (mFileIndex != null);
}
/**
* Get the number of levels employed by this index.
* @return Number of levels in this index.
*/
public int getNumIndexLevels() {
if(mFileIndex == null)
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
return mFileIndex.getNumIndexLevels();
}
/**
* Gets the level associated with the given bin number.
* @param bin The bin for which to determine the level.
* @return the level associated with the given bin number.
*/
public int getLevelForBin(final Bin bin) {
if(mFileIndex == null)
throw new SAMException("Unable to determine level of bin number; BAM file index is not present.");
return mFileIndex.getLevelForBinNumber(bin.binNumber);
}
public SAMFileHeader getFileHeader() {
return mReader.getFileHeader();
}
@ -190,19 +208,26 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
* @return An iterator over the given chunks.
*/
public CloseableIterator<SAMRecord> iterator(List<Chunk> chunks) {
// TODO: Add sanity checks so that we're not doing this against a BAM file.
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
if(!(mReader instanceof BAMFileReader2))
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
return ((BAMFileReader2)mReader).getIterator(chunks);
}
public List<Chunk> getOverlappingFilePointers(final String sequence, final int start, final int end) {
// TODO: Add sanity checks so that we're not doing this against a BAM file.
public List<Bin> getOverlappingBins(final String sequence, final int start, final int end) {
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
if(!(mReader instanceof BAMFileReader2))
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
return ((BAMFileReader2)mReader).getOverlappingFilePointers(sequence,start,end);
return ((BAMFileReader2)mReader).getOverlappingBins(sequence,start,end);
}
public List<Chunk> getFilePointersBounding(final Bin bin) {
if(!(mReader instanceof BAMFileReader2))
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
return ((BAMFileReader2)mReader).getFilePointersBounding(bin);
}
/**
* Iterate over records that match the given interval. Only valid to call this if hasIndex() == true.
* <p/>
@ -382,7 +407,7 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
final BAMFileReader2 reader = new BAMFileReader2(url, eagerDecode, validationStringency);
mReader = reader;
if (indexFile != null) {
mFileIndex = new BAMFileIndex(indexFile);
mFileIndex = new BAMFileIndex2(indexFile);
reader.setFileIndex(mFileIndex);
}
} else {
@ -411,7 +436,7 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
indexFile = findIndexFile(file);
}
if (indexFile != null) {
mFileIndex = new BAMFileIndex(indexFile);
mFileIndex = new BAMFileIndex2(indexFile);
reader.setFileIndex(mFileIndex);
if (indexFile.lastModified() < file.lastModified()) {
System.err.println("WARNING: BAM index file " + indexFile.getAbsolutePath() +

View File

@ -25,7 +25,7 @@ public class BAMFileStat extends CommandLineProgram {
private CommandType command;
@Argument(doc="The BAM file to inspect.",required=true)
private File bamFile;
private String bamFileName;
@Argument(doc="The range of blocks to inspect.",required=false)
private String range;
@ -46,10 +46,10 @@ public class BAMFileStat extends CommandLineProgram {
switch(command) {
case ShowBlocks:
showBlocks(bamFile,startPosition,stopPosition);
showBlocks(new File(bamFileName),startPosition,stopPosition);
break;
case ShowIndex:
showIndexBins(bamFile);
showIndexBins(new File(bamFileName+".bai"));
break;
}
return 0;

View File

@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSA
import java.util.*;
import net.sf.samtools.Chunk;
import net.sf.samtools.Bin;
/*
* Copyright (c) 2009 The Broad Institute
@ -40,9 +41,18 @@ import net.sf.samtools.Chunk;
* A sharding strategy for loci based on reading of the index.
*/
public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
/**
* The data source to use when performing this sharding.
*/
private final BlockDrivenSAMDataSource blockDrivenDataSource;
/** our storage of the genomic locations they'd like to shard over */
private final SortedMap<GenomeLoc,List<Chunk>> locations = new TreeMap<GenomeLoc,List<Chunk>>();
private final List<FilePointer> filePointers = new ArrayList<FilePointer>();
/**
* An iterator through the available file pointers.
*/
private final Iterator<FilePointer> filePointerIterator;
/**
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
@ -50,8 +60,39 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
* @param locations List of locations for which to load data.
*/
IndexDelimitedLocusShardStrategy(SAMDataSource dataSource, GenomeLocSortedSet locations) {
for(GenomeLoc location: locations)
this.locations.put(location,((BlockDrivenSAMDataSource)dataSource).getOverlappingFilePointers(location));
if(!(dataSource instanceof BlockDrivenSAMDataSource))
throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source.");
blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource;
final int deepestBinLevel = blockDrivenDataSource.getNumIndexLevels()-1;
// Create a list of contig name -> genome loc, sorted in INSERTION ORDER.
LinkedHashMap<String,List<GenomeLoc>> locationToReference = new LinkedHashMap<String,List<GenomeLoc>>();
for(GenomeLoc location: locations) {
if(!locationToReference.containsKey(location.getContig()))
locationToReference.put(location.getContig(),new ArrayList<GenomeLoc>());
locationToReference.get(location.getContig()).add(location);
}
// Group the loci by bin, sorted in the order in which bins appear in the file. Only use the smallest bins in the set.
// TODO -- does this work with large interval lists?
for(String contig: locationToReference.keySet()) {
SortedMap<Bin,List<GenomeLoc>> bins = new TreeMap<Bin,List<GenomeLoc>>();
for(GenomeLoc location: locationToReference.get(contig)) {
List<Bin> binsForLocation = blockDrivenDataSource.getOverlappingBins(location);
for(Bin bin: binsForLocation) {
if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) {
if(!bins.containsKey(bin))
bins.put(bin,new ArrayList<GenomeLoc>());
bins.get(bin).add(location);
}
}
}
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet())
filePointers.add(new FilePointer(entry.getKey(),entry.getValue()));
}
filePointerIterator = filePointers.iterator();
}
/**
@ -60,7 +101,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
* @return false if we're done processing shards
*/
public boolean hasNext() {
return ( !locations.isEmpty() );
return filePointerIterator.hasNext();
}
/**
@ -69,16 +110,9 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
* @return the next shard
*/
public IndexDelimitedLocusShard next() {
if (( this.locations == null ) || ( locations.isEmpty() )) {
throw new StingException("IntervalShardStrategy: genomic regions list is empty in next() function.");
}
// get the first region in the list
GenomeLoc loc = locations.firstKey();
List<Chunk> filePointers = locations.get(loc);
locations.remove(loc);
return new IndexDelimitedLocusShard(Collections.singletonList(loc),filePointers,Shard.ShardType.LOCUS_INTERVAL);
FilePointer nextFilePointer = filePointerIterator.next();
List<Chunk> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin);
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
}
/** we don't support the remove command */
@ -94,4 +128,19 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
public Iterator<Shard> iterator() {
return this;
}
/**
* Represents a small section of a BAM file, and every associated interval.
*/
private class FilePointer {
private final Bin bin;
private final List<GenomeLoc> locations;
public FilePointer(Bin bin, List<GenomeLoc> locations) {
this.bin = bin;
this.locations = locations;
}
}
}

View File

@ -44,8 +44,33 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
return reader.hasIndex();
}
public List<Chunk> getOverlappingFilePointers(GenomeLoc location) {
return reader.getOverlappingFilePointers(location.getContig(),(int)location.getStart(),(int)location.getStop());
public List<Bin> getOverlappingBins(GenomeLoc location) {
return reader.getOverlappingBins(location.getContig(),(int)location.getStart(),(int)location.getStop());
}
public List<Chunk> getFilePointersBounding(final Bin bin) {
return reader.getFilePointersBounding(bin);
}
/**
* Get the number of levels employed by this index.
* @return Number of levels in this index.
*/
public int getNumIndexLevels() {
if(!hasIndex())
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
return reader.getNumIndexLevels();
}
/**
* Gets the level associated with the given bin number.
* @param bin The bin for which to determine the level.
* @return the level associated with the given bin number.
*/
public int getLevelForBin(final Bin bin) {
if(!hasIndex())
throw new SAMException("Unable to determine level of bin number; BAM file index is not present.");
return reader.getLevelForBin(bin);
}
public StingSAMIterator seek(Shard shard) {