Checkpoint for combining adjacent intervals into the same shard.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2782 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0d347d662a
commit
e53432d54d
|
|
@ -39,17 +39,48 @@ public class BAMFileIndex2
|
||||||
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
|
private static final int MAX_BINS = 37450; // =(8^6-1)/7+1
|
||||||
private static final int BAM_LIDX_SHIFT = 14;
|
private static final int BAM_LIDX_SHIFT = 14;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* What is the starting bin for each level?
|
||||||
|
*/
|
||||||
|
private static final int[] LEVEL_STARTS = {0,1,9,73,585,4681};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A mapping of reference sequence index to list of bins.
|
* A mapping of reference sequence index to list of bins.
|
||||||
*/
|
*/
|
||||||
protected final SortedMap<Integer,Bin[]> referenceToBins = new TreeMap<Integer,Bin[]>();
|
protected final SortedMap<Integer,Bin[]> referenceToBins = new TreeMap<Integer,Bin[]>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A mapping of reference sequence index to linear indices.
|
||||||
|
*/
|
||||||
protected final SortedMap<Integer,LinearIndex> referenceToLinearIndices = new TreeMap<Integer,LinearIndex>();
|
protected final SortedMap<Integer,LinearIndex> referenceToLinearIndices = new TreeMap<Integer,LinearIndex>();
|
||||||
|
|
||||||
protected BAMFileIndex2(final File file) {
|
protected BAMFileIndex2(final File file) {
|
||||||
loadIndex(file);
|
loadIndex(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of levels employed by this index.
|
||||||
|
* @return Number of levels in this index.
|
||||||
|
*/
|
||||||
|
protected int getNumIndexLevels() {
|
||||||
|
return LEVEL_STARTS.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the level associated with the given bin number.
|
||||||
|
* @param binNumber The bin number for which to determine the level.
|
||||||
|
* @return the level associated with the given bin number.
|
||||||
|
*/
|
||||||
|
protected int getLevelForBinNumber(final int binNumber) {
|
||||||
|
if(binNumber >= MAX_BINS)
|
||||||
|
throw new SAMException("Tried to get level for invalid bin.");
|
||||||
|
for(int i = getNumIndexLevels()-1; i >= 0; i--) {
|
||||||
|
if(binNumber >= LEVEL_STARTS[i])
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
throw new SAMException("Unable to find correct bin for bin number "+binNumber);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Completely load the index into memory.
|
* Completely load the index into memory.
|
||||||
* @param file File to load.
|
* @param file File to load.
|
||||||
|
|
@ -121,30 +152,16 @@ public class BAMFileIndex2
|
||||||
* positions. The last position in each pair is a virtual file pointer to the first SAMRecord beyond
|
* positions. The last position in each pair is a virtual file pointer to the first SAMRecord beyond
|
||||||
* the range that may contain the indicated SAMRecords.
|
* the range that may contain the indicated SAMRecords.
|
||||||
*/
|
*/
|
||||||
long[] getSearchBins(final int referenceIndex, final int startPos, final int endPos) {
|
long[] getFilePointersContaining(final int referenceIndex, final int startPos, final int endPos) {
|
||||||
|
List<Bin> bins = getBinsContaining(referenceIndex,startPos,endPos);
|
||||||
// System.out.println("# Sequence count: " + sequenceCount);
|
|
||||||
if (referenceIndex >= referenceToBins.size()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
final BitSet regionBins = regionToBins(startPos, endPos);
|
|
||||||
if (regionBins == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
Bin[] bins = referenceToBins.get(referenceIndex);
|
|
||||||
|
|
||||||
// System.out.println("# Sequence target TID: " + referenceIndex);
|
// System.out.println("# Sequence target TID: " + referenceIndex);
|
||||||
if (bins.length == 0) {
|
if (bins.size() == 0) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Chunk> chunkList = new ArrayList<Chunk>();
|
List<Chunk> chunkList = new ArrayList<Chunk>();
|
||||||
for(Bin bin: bins) {
|
for(Bin bin: bins)
|
||||||
if (regionBins.get(bin.binNumber))
|
chunkList.addAll(bin.chunks);
|
||||||
chunkList.addAll(bin.chunks);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (chunkList.isEmpty()) {
|
if (chunkList.isEmpty()) {
|
||||||
return null;
|
return null;
|
||||||
|
|
@ -161,6 +178,39 @@ public class BAMFileIndex2
|
||||||
return convertToArray(chunkList);
|
return convertToArray(chunkList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long[] getFilePointersBounding(final Bin bin) {
|
||||||
|
return convertToArray(bin.chunks);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a list of bins in the BAM file that may contain SAMRecords for the given range.
|
||||||
|
* @param referenceIndex sequence of desired SAMRecords
|
||||||
|
* @param startPos 1-based start of the desired interval, inclusive
|
||||||
|
* @param endPos 1-based end of the desired interval, inclusive
|
||||||
|
* @return a list of bins that contain relevant data.
|
||||||
|
*/
|
||||||
|
List<Bin> getBinsContaining(final int referenceIndex, final int startPos, final int endPos) {
|
||||||
|
List<Bin> filteredBins = new ArrayList<Bin>();
|
||||||
|
|
||||||
|
if (referenceIndex >= referenceToBins.size()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
final BitSet regionBins = regionToBins(startPos, endPos);
|
||||||
|
if (regionBins == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
Bin[] bins = referenceToBins.get(referenceIndex);
|
||||||
|
|
||||||
|
for(Bin bin: bins) {
|
||||||
|
if (regionBins.get(bin.binNumber))
|
||||||
|
filteredBins.add(bin);
|
||||||
|
}
|
||||||
|
|
||||||
|
return filteredBins;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Use to get close to the unmapped reads at the end of a BAM file.
|
* Use to get close to the unmapped reads at the end of a BAM file.
|
||||||
* @return The file offset of the first record in the last linear bin, or -1
|
* @return The file offset of the first record in the last linear bin, or -1
|
||||||
|
|
@ -230,11 +280,11 @@ public class BAMFileIndex2
|
||||||
int k;
|
int k;
|
||||||
final BitSet bitSet = new BitSet(MAX_BINS);
|
final BitSet bitSet = new BitSet(MAX_BINS);
|
||||||
bitSet.set(0);
|
bitSet.set(0);
|
||||||
for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k);
|
for (k = LEVEL_STARTS[1] + (start>>26); k <= LEVEL_STARTS[1] + (end>>26); ++k) bitSet.set(k);
|
||||||
for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k);
|
for (k = LEVEL_STARTS[2] + (start>>23); k <= LEVEL_STARTS[2] + (end>>23); ++k) bitSet.set(k);
|
||||||
for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k);
|
for (k = LEVEL_STARTS[3] + (start>>20); k <= LEVEL_STARTS[3] + (end>>20); ++k) bitSet.set(k);
|
||||||
for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k);
|
for (k = LEVEL_STARTS[4] + (start>>17); k <= LEVEL_STARTS[4] + (end>>17); ++k) bitSet.set(k);
|
||||||
for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k);
|
for (k = LEVEL_STARTS[5] + (start>>14); k <= LEVEL_STARTS[5] + (end>>14); ++k) bitSet.set(k);
|
||||||
return bitSet;
|
return bitSet;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -50,7 +50,7 @@ class BAMFileReader2
|
||||||
private final BlockCompressedInputStream mCompressedInputStream;
|
private final BlockCompressedInputStream mCompressedInputStream;
|
||||||
private SAMFileHeader mFileHeader = null;
|
private SAMFileHeader mFileHeader = null;
|
||||||
// Populated if the file is seekable and an index exists
|
// Populated if the file is seekable and an index exists
|
||||||
private BAMFileIndex mFileIndex = null;
|
private BAMFileIndex2 mFileIndex = null;
|
||||||
private long mFirstRecordPointer = 0;
|
private long mFirstRecordPointer = 0;
|
||||||
private CloseableIterator<SAMRecord> mCurrentIterator = null;
|
private CloseableIterator<SAMRecord> mCurrentIterator = null;
|
||||||
// If true, all SAMRecords are fully decoded as they are read.
|
// If true, all SAMRecords are fully decoded as they are read.
|
||||||
|
|
@ -115,11 +115,11 @@ class BAMFileReader2
|
||||||
/**
|
/**
|
||||||
* @return the file index, if one exists, else null.
|
* @return the file index, if one exists, else null.
|
||||||
*/
|
*/
|
||||||
BAMFileIndex getFileIndex() {
|
BAMFileIndex2 getFileIndex() {
|
||||||
return mFileIndex;
|
return mFileIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
void setFileIndex(final BAMFileIndex fileIndex) {
|
void setFileIndex(final BAMFileIndex2 fileIndex) {
|
||||||
mFileIndex = fileIndex;
|
mFileIndex = fileIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -184,17 +184,21 @@ class BAMFileReader2
|
||||||
return mCurrentIterator;
|
return mCurrentIterator;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Chunk> getOverlappingFilePointers(final String sequence, final int start, final int end) {
|
public List<Bin> getOverlappingBins(final String sequence, final int start, final int end) {
|
||||||
long[] filePointers = null;
|
List<Bin> bins = null;
|
||||||
|
|
||||||
final SAMFileHeader fileHeader = getFileHeader();
|
final SAMFileHeader fileHeader = getFileHeader();
|
||||||
int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||||
if (referenceIndex != -1) {
|
if (referenceIndex != -1) {
|
||||||
final BAMFileIndex fileIndex = getFileIndex();
|
final BAMFileIndex2 fileIndex = getFileIndex();
|
||||||
filePointers = fileIndex.getSearchBins(referenceIndex, start, end);
|
bins = fileIndex.getBinsContaining(referenceIndex, start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Chunk.toChunkList(filePointers);
|
return bins;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Chunk> getFilePointersBounding(Bin bin) {
|
||||||
|
return Chunk.toChunkList(getFileIndex().getFilePointersBounding(bin));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -463,8 +467,8 @@ class BAMFileReader2
|
||||||
final SAMFileHeader fileHeader = getFileHeader();
|
final SAMFileHeader fileHeader = getFileHeader();
|
||||||
int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||||
if (referenceIndex != -1) {
|
if (referenceIndex != -1) {
|
||||||
final BAMFileIndex fileIndex = getFileIndex();
|
final BAMFileIndex2 fileIndex = getFileIndex();
|
||||||
filePointers = fileIndex.getSearchBins(referenceIndex, start, end);
|
filePointers = fileIndex.getFilePointersContaining(referenceIndex, start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create an iterator over the above chunk boundaries.
|
// Create an iterator over the above chunk boundaries.
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ import java.util.List;
|
||||||
* @author mhanna
|
* @author mhanna
|
||||||
* @version 0.1
|
* @version 0.1
|
||||||
*/
|
*/
|
||||||
public class Bin {
|
public class Bin implements Comparable {
|
||||||
/**
|
/**
|
||||||
* The reference sequence associated with this bin.
|
* The reference sequence associated with this bin.
|
||||||
*/
|
*/
|
||||||
|
|
@ -29,4 +29,36 @@ public class Bin {
|
||||||
this.binNumber = binNumber;
|
this.binNumber = binNumber;
|
||||||
this.chunks = chunks;
|
this.chunks = chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* See whether two bins are equal. If the ref seq and the bin number
|
||||||
|
* are equal, assume equality of the chunk list.
|
||||||
|
* @param other The other Bin to which to compare this.
|
||||||
|
* @return True if the two bins are equal. False otherwise.
|
||||||
|
*/
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if(other == null) return false;
|
||||||
|
if(!(other instanceof Bin)) return false;
|
||||||
|
|
||||||
|
Bin otherBin = (Bin)other;
|
||||||
|
return this.referenceSequence == otherBin.referenceSequence && this.binNumber == otherBin.binNumber;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare two bins to see what ordering they should appear in.
|
||||||
|
* @param other Other bin to which this bin should be compared.
|
||||||
|
* @return -1 if this < other, 0 if this == other, 1 if this > other.
|
||||||
|
*/
|
||||||
|
public int compareTo(Object other) {
|
||||||
|
if(other == null)
|
||||||
|
throw new ClassCastException("Cannot compare to a null object");
|
||||||
|
Bin otherBin = (Bin)other;
|
||||||
|
|
||||||
|
// Check the reference sequences first.
|
||||||
|
if(this.referenceSequence != otherBin.referenceSequence)
|
||||||
|
return ((Integer)referenceSequence).compareTo(otherBin.referenceSequence);
|
||||||
|
|
||||||
|
// Then check the bin ordering.
|
||||||
|
return ((Integer)binNumber).compareTo(otherBin.binNumber);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -58,7 +58,7 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean mIsBinary = false;
|
private boolean mIsBinary = false;
|
||||||
private BAMFileIndex mFileIndex = null;
|
private BAMFileIndex2 mFileIndex = null;
|
||||||
private ReaderImplementation mReader = null;
|
private ReaderImplementation mReader = null;
|
||||||
private File samFile = null;
|
private File samFile = null;
|
||||||
|
|
||||||
|
|
@ -138,9 +138,6 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
||||||
if (mReader != null) {
|
if (mReader != null) {
|
||||||
mReader.close();
|
mReader.close();
|
||||||
}
|
}
|
||||||
if (mFileIndex != null) {
|
|
||||||
mFileIndex.close();
|
|
||||||
}
|
|
||||||
mReader = null;
|
mReader = null;
|
||||||
mFileIndex = null;
|
mFileIndex = null;
|
||||||
}
|
}
|
||||||
|
|
@ -159,6 +156,27 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
||||||
return (mFileIndex != null);
|
return (mFileIndex != null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of levels employed by this index.
|
||||||
|
* @return Number of levels in this index.
|
||||||
|
*/
|
||||||
|
public int getNumIndexLevels() {
|
||||||
|
if(mFileIndex == null)
|
||||||
|
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||||
|
return mFileIndex.getNumIndexLevels();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the level associated with the given bin number.
|
||||||
|
* @param bin The bin for which to determine the level.
|
||||||
|
* @return the level associated with the given bin number.
|
||||||
|
*/
|
||||||
|
public int getLevelForBin(final Bin bin) {
|
||||||
|
if(mFileIndex == null)
|
||||||
|
throw new SAMException("Unable to determine level of bin number; BAM file index is not present.");
|
||||||
|
return mFileIndex.getLevelForBinNumber(bin.binNumber);
|
||||||
|
}
|
||||||
|
|
||||||
public SAMFileHeader getFileHeader() {
|
public SAMFileHeader getFileHeader() {
|
||||||
return mReader.getFileHeader();
|
return mReader.getFileHeader();
|
||||||
}
|
}
|
||||||
|
|
@ -190,19 +208,26 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
||||||
* @return An iterator over the given chunks.
|
* @return An iterator over the given chunks.
|
||||||
*/
|
*/
|
||||||
public CloseableIterator<SAMRecord> iterator(List<Chunk> chunks) {
|
public CloseableIterator<SAMRecord> iterator(List<Chunk> chunks) {
|
||||||
// TODO: Add sanity checks so that we're not doing this against a BAM file.
|
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
||||||
if(!(mReader instanceof BAMFileReader2))
|
if(!(mReader instanceof BAMFileReader2))
|
||||||
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
|
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
|
||||||
return ((BAMFileReader2)mReader).getIterator(chunks);
|
return ((BAMFileReader2)mReader).getIterator(chunks);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Chunk> getOverlappingFilePointers(final String sequence, final int start, final int end) {
|
public List<Bin> getOverlappingBins(final String sequence, final int start, final int end) {
|
||||||
// TODO: Add sanity checks so that we're not doing this against a BAM file.
|
// TODO: Add sanity checks so that we're not doing this against an unsupported BAM file.
|
||||||
if(!(mReader instanceof BAMFileReader2))
|
if(!(mReader instanceof BAMFileReader2))
|
||||||
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
|
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
|
||||||
return ((BAMFileReader2)mReader).getOverlappingFilePointers(sequence,start,end);
|
return ((BAMFileReader2)mReader).getOverlappingBins(sequence,start,end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<Chunk> getFilePointersBounding(final Bin bin) {
|
||||||
|
if(!(mReader instanceof BAMFileReader2))
|
||||||
|
throw new PicardException("This call cannot be performed without a backing BAMFileReader2");
|
||||||
|
return ((BAMFileReader2)mReader).getFilePointersBounding(bin);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Iterate over records that match the given interval. Only valid to call this if hasIndex() == true.
|
* Iterate over records that match the given interval. Only valid to call this if hasIndex() == true.
|
||||||
* <p/>
|
* <p/>
|
||||||
|
|
@ -382,7 +407,7 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
||||||
final BAMFileReader2 reader = new BAMFileReader2(url, eagerDecode, validationStringency);
|
final BAMFileReader2 reader = new BAMFileReader2(url, eagerDecode, validationStringency);
|
||||||
mReader = reader;
|
mReader = reader;
|
||||||
if (indexFile != null) {
|
if (indexFile != null) {
|
||||||
mFileIndex = new BAMFileIndex(indexFile);
|
mFileIndex = new BAMFileIndex2(indexFile);
|
||||||
reader.setFileIndex(mFileIndex);
|
reader.setFileIndex(mFileIndex);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -411,7 +436,7 @@ public class SAMFileReader2 implements Iterable<SAMRecord> {
|
||||||
indexFile = findIndexFile(file);
|
indexFile = findIndexFile(file);
|
||||||
}
|
}
|
||||||
if (indexFile != null) {
|
if (indexFile != null) {
|
||||||
mFileIndex = new BAMFileIndex(indexFile);
|
mFileIndex = new BAMFileIndex2(indexFile);
|
||||||
reader.setFileIndex(mFileIndex);
|
reader.setFileIndex(mFileIndex);
|
||||||
if (indexFile.lastModified() < file.lastModified()) {
|
if (indexFile.lastModified() < file.lastModified()) {
|
||||||
System.err.println("WARNING: BAM index file " + indexFile.getAbsolutePath() +
|
System.err.println("WARNING: BAM index file " + indexFile.getAbsolutePath() +
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ public class BAMFileStat extends CommandLineProgram {
|
||||||
private CommandType command;
|
private CommandType command;
|
||||||
|
|
||||||
@Argument(doc="The BAM file to inspect.",required=true)
|
@Argument(doc="The BAM file to inspect.",required=true)
|
||||||
private File bamFile;
|
private String bamFileName;
|
||||||
|
|
||||||
@Argument(doc="The range of blocks to inspect.",required=false)
|
@Argument(doc="The range of blocks to inspect.",required=false)
|
||||||
private String range;
|
private String range;
|
||||||
|
|
@ -46,10 +46,10 @@ public class BAMFileStat extends CommandLineProgram {
|
||||||
|
|
||||||
switch(command) {
|
switch(command) {
|
||||||
case ShowBlocks:
|
case ShowBlocks:
|
||||||
showBlocks(bamFile,startPosition,stopPosition);
|
showBlocks(new File(bamFileName),startPosition,stopPosition);
|
||||||
break;
|
break;
|
||||||
case ShowIndex:
|
case ShowIndex:
|
||||||
showIndexBins(bamFile);
|
showIndexBins(new File(bamFileName+".bai"));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSA
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import net.sf.samtools.Chunk;
|
import net.sf.samtools.Chunk;
|
||||||
|
import net.sf.samtools.Bin;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009 The Broad Institute
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
|
@ -40,9 +41,18 @@ import net.sf.samtools.Chunk;
|
||||||
* A sharding strategy for loci based on reading of the index.
|
* A sharding strategy for loci based on reading of the index.
|
||||||
*/
|
*/
|
||||||
public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
|
/**
|
||||||
|
* The data source to use when performing this sharding.
|
||||||
|
*/
|
||||||
|
private final BlockDrivenSAMDataSource blockDrivenDataSource;
|
||||||
|
|
||||||
/** our storage of the genomic locations they'd like to shard over */
|
/** our storage of the genomic locations they'd like to shard over */
|
||||||
private final SortedMap<GenomeLoc,List<Chunk>> locations = new TreeMap<GenomeLoc,List<Chunk>>();
|
private final List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An iterator through the available file pointers.
|
||||||
|
*/
|
||||||
|
private final Iterator<FilePointer> filePointerIterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
|
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
|
||||||
|
|
@ -50,8 +60,39 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
* @param locations List of locations for which to load data.
|
* @param locations List of locations for which to load data.
|
||||||
*/
|
*/
|
||||||
IndexDelimitedLocusShardStrategy(SAMDataSource dataSource, GenomeLocSortedSet locations) {
|
IndexDelimitedLocusShardStrategy(SAMDataSource dataSource, GenomeLocSortedSet locations) {
|
||||||
for(GenomeLoc location: locations)
|
if(!(dataSource instanceof BlockDrivenSAMDataSource))
|
||||||
this.locations.put(location,((BlockDrivenSAMDataSource)dataSource).getOverlappingFilePointers(location));
|
throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source.");
|
||||||
|
|
||||||
|
blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource;
|
||||||
|
final int deepestBinLevel = blockDrivenDataSource.getNumIndexLevels()-1;
|
||||||
|
|
||||||
|
// Create a list of contig name -> genome loc, sorted in INSERTION ORDER.
|
||||||
|
LinkedHashMap<String,List<GenomeLoc>> locationToReference = new LinkedHashMap<String,List<GenomeLoc>>();
|
||||||
|
for(GenomeLoc location: locations) {
|
||||||
|
if(!locationToReference.containsKey(location.getContig()))
|
||||||
|
locationToReference.put(location.getContig(),new ArrayList<GenomeLoc>());
|
||||||
|
locationToReference.get(location.getContig()).add(location);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Group the loci by bin, sorted in the order in which bins appear in the file. Only use the smallest bins in the set.
|
||||||
|
// TODO -- does this work with large interval lists?
|
||||||
|
for(String contig: locationToReference.keySet()) {
|
||||||
|
SortedMap<Bin,List<GenomeLoc>> bins = new TreeMap<Bin,List<GenomeLoc>>();
|
||||||
|
for(GenomeLoc location: locationToReference.get(contig)) {
|
||||||
|
List<Bin> binsForLocation = blockDrivenDataSource.getOverlappingBins(location);
|
||||||
|
for(Bin bin: binsForLocation) {
|
||||||
|
if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) {
|
||||||
|
if(!bins.containsKey(bin))
|
||||||
|
bins.put(bin,new ArrayList<GenomeLoc>());
|
||||||
|
bins.get(bin).add(location);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet())
|
||||||
|
filePointers.add(new FilePointer(entry.getKey(),entry.getValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
filePointerIterator = filePointers.iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -60,7 +101,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
* @return false if we're done processing shards
|
* @return false if we're done processing shards
|
||||||
*/
|
*/
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
return ( !locations.isEmpty() );
|
return filePointerIterator.hasNext();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -69,16 +110,9 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
* @return the next shard
|
* @return the next shard
|
||||||
*/
|
*/
|
||||||
public IndexDelimitedLocusShard next() {
|
public IndexDelimitedLocusShard next() {
|
||||||
if (( this.locations == null ) || ( locations.isEmpty() )) {
|
FilePointer nextFilePointer = filePointerIterator.next();
|
||||||
throw new StingException("IntervalShardStrategy: genomic regions list is empty in next() function.");
|
List<Chunk> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin);
|
||||||
}
|
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
|
||||||
|
|
||||||
// get the first region in the list
|
|
||||||
GenomeLoc loc = locations.firstKey();
|
|
||||||
List<Chunk> filePointers = locations.get(loc);
|
|
||||||
locations.remove(loc);
|
|
||||||
|
|
||||||
return new IndexDelimitedLocusShard(Collections.singletonList(loc),filePointers,Shard.ShardType.LOCUS_INTERVAL);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** we don't support the remove command */
|
/** we don't support the remove command */
|
||||||
|
|
@ -94,4 +128,19 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
public Iterator<Shard> iterator() {
|
public Iterator<Shard> iterator() {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents a small section of a BAM file, and every associated interval.
|
||||||
|
*/
|
||||||
|
private class FilePointer {
|
||||||
|
private final Bin bin;
|
||||||
|
private final List<GenomeLoc> locations;
|
||||||
|
|
||||||
|
public FilePointer(Bin bin, List<GenomeLoc> locations) {
|
||||||
|
this.bin = bin;
|
||||||
|
this.locations = locations;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -44,8 +44,33 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
|
||||||
return reader.hasIndex();
|
return reader.hasIndex();
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Chunk> getOverlappingFilePointers(GenomeLoc location) {
|
public List<Bin> getOverlappingBins(GenomeLoc location) {
|
||||||
return reader.getOverlappingFilePointers(location.getContig(),(int)location.getStart(),(int)location.getStop());
|
return reader.getOverlappingBins(location.getContig(),(int)location.getStart(),(int)location.getStop());
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Chunk> getFilePointersBounding(final Bin bin) {
|
||||||
|
return reader.getFilePointersBounding(bin);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of levels employed by this index.
|
||||||
|
* @return Number of levels in this index.
|
||||||
|
*/
|
||||||
|
public int getNumIndexLevels() {
|
||||||
|
if(!hasIndex())
|
||||||
|
throw new SAMException("Unable to determine number of index levels; BAM file index is not present.");
|
||||||
|
return reader.getNumIndexLevels();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the level associated with the given bin number.
|
||||||
|
* @param bin The bin for which to determine the level.
|
||||||
|
* @return the level associated with the given bin number.
|
||||||
|
*/
|
||||||
|
public int getLevelForBin(final Bin bin) {
|
||||||
|
if(!hasIndex())
|
||||||
|
throw new SAMException("Unable to determine level of bin number; BAM file index is not present.");
|
||||||
|
return reader.getLevelForBin(bin);
|
||||||
}
|
}
|
||||||
|
|
||||||
public StingSAMIterator seek(Shard shard) {
|
public StingSAMIterator seek(Shard shard) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue