Misc correctness enhancements: develop the bin selector into a recursive algorithm and return a shard when reads are missing. Also improve the performance of the read filter that clips reads not actually present in the shard.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2870 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-02-22 22:19:06 +00:00
parent 5f9c3f3884
commit 88d0677379
3 changed files with 105 additions and 28 deletions

View File

@ -41,7 +41,6 @@ import org.broadinstitute.sting.utils.StingException;
public class SAMFileReader2 extends SAMFileReader {
/**
* Prepare to read a SAM or BAM file. If the given file is a BAM, and has a companion BAI index file
* that is named according to the convention, it will be found and opened, and indexed query will be allowed.
*/
public SAMFileReader2(final File file) {
this(file, null, false);

View File

@ -66,7 +66,6 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source.");
blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource;
final int deepestBinLevel = blockDrivenDataSource.getNumIndexLevels()-1;
// Create a list of contig name -> genome loc, sorted in INSERTION ORDER.
LinkedHashMap<String,List<GenomeLoc>> locationToReference = new LinkedHashMap<String,List<GenomeLoc>>();
@ -76,34 +75,103 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
locationToReference.get(location.getContig()).add(location);
}
// TODO: Not sure there's any reason to pre-separate the contigs now that we're using a streaming approach to file pointer allocation.
for(String contig: locationToReference.keySet()) {
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
SortedMap<Bin,List<GenomeLoc>> bins = new TreeMap<Bin,List<GenomeLoc>>();
for(GenomeLoc location: locationToReference.get(contig)) {
List<Bin> binsForLocation = blockDrivenDataSource.getOverlappingBins(location);
for(Bin bin: binsForLocation) {
if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) {
final int firstLoc = blockDrivenDataSource.getFirstLocusInBin(bin);
final int lastLoc = blockDrivenDataSource.getLastLocusInBin(bin);
if(!bins.containsKey(bin))
bins.put(bin,new ArrayList<GenomeLoc>());
bins.get(bin).add(GenomeLocParser.createGenomeLoc(location.getContig(),
Math.max(location.getStart(),firstLoc),
Math.min(location.getStop(),lastLoc)));
}
}
}
// Add a record of the new bin structure.
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet()) {
Collections.sort(entry.getValue());
filePointers.add(new FilePointer(entry.getKey(),entry.getValue()));
}
filePointers.addAll(batchLociIntoBins(locationToReference.get(contig),blockDrivenDataSource.getNumIndexLevels()-1));
}
filePointerIterator = filePointers.iterator();
}
private List<FilePointer> batchLociIntoBins(final List<GenomeLoc> loci, final int binsDeeperThan) {
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
List<FilePointer> filePointers = new ArrayList<FilePointer>();
FilePointer filePointer = null;
for(GenomeLoc location: loci) {
int locationStart = (int)location.getStart();
final int locationStop = (int)location.getStop();
List<Bin> bins = findBinsAtLeastAsDeepAs(blockDrivenDataSource.getOverlappingBins(location),binsDeeperThan);
if(bins.size() == 0) {
if(filePointer != null && filePointer.locations.size() > 0) {
filePointers.add(filePointer);
filePointer = null;
}
filePointers.add(new FilePointer(location));
continue;
}
Collections.sort(bins);
Iterator<Bin> binIterator = bins.iterator();
while(locationStop >= locationStart) {
int binStart = filePointer!=null ? blockDrivenDataSource.getFirstLocusInBin(filePointer.bin) : 0;
int binStop = filePointer!=null ? blockDrivenDataSource.getLastLocusInBin(filePointer.bin) : 0;
while(binStop <= locationStart && binIterator.hasNext()) {
if(filePointer != null && filePointer.locations.size() > 0)
filePointers.add(filePointer);
filePointer = new FilePointer(binIterator.next());
binStart = blockDrivenDataSource.getFirstLocusInBin(filePointer.bin);
binStop = blockDrivenDataSource.getLastLocusInBin(filePointer.bin);
}
if(locationStart < binStart) {
// The region starts before the first bin in the sequence. Add the region occurring before the sequence.
if(filePointer != null && filePointer.locations.size() > 0) {
filePointers.add(filePointer);
filePointer = null;
}
final int regionStop = Math.min(locationStop,binStart-1);
GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop);
filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1));
locationStart = regionStop + 1;
}
else if(locationStart > binStop) {
// The region starts after the last bin in the sequence. Add the region occurring after the sequence.
if(filePointer != null && filePointer.locations.size() > 0) {
filePointers.add(filePointer);
filePointer = null;
}
GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,locationStop);
filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1));
locationStart = locationStop + 1;
}
else {
// The start of the region overlaps the bin. Add the overlapping subset.
final int regionStop = Math.min(locationStop,binStop);
filePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(),
locationStart,
regionStop));
locationStart = regionStop + 1;
}
}
}
if(filePointer != null && filePointer.locations.size() > 0)
filePointers.add(filePointer);
return filePointers;
}
private List<Bin> findBinsAtLeastAsDeepAs(final List<Bin> bins, final int deepestBinLevel) {
List<Bin> deepestBins = new ArrayList<Bin>();
for(Bin bin: bins) {
if(blockDrivenDataSource.getLevelForBin(bin) >= deepestBinLevel)
deepestBins.add(bin);
}
return deepestBins;
}
/**
* returns true if there are additional shards
*
@ -145,9 +213,18 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
private final Bin bin;
private final List<GenomeLoc> locations;
public FilePointer(Bin bin, List<GenomeLoc> locations) {
public FilePointer(Bin bin) {
this.bin = bin;
this.locations = locations;
this.locations = new ArrayList<GenomeLoc>();
}
public FilePointer(GenomeLoc location) {
bin = null;
locations = Collections.singletonList(location);
}
public void addLocation(GenomeLoc location) {
locations.add(location);
}
}

View File

@ -29,9 +29,10 @@ public class IntervalOverlappingFilter implements SamRecordFilter {
* @return True to filter the read out. False otherwise.
*/
public boolean filterOut(SAMRecord read) {
GenomeLoc readLocation = GenomeLocParser.createGenomeLoc(read);
for(GenomeLoc interval: intervals) {
if(interval.overlapsP(readLocation))
if((read.getAlignmentStart() >= interval.getStart() && read.getAlignmentStart() <= interval.getStop()) ||
(read.getAlignmentEnd() >= interval.getStart() && read.getAlignmentEnd() <= interval.getStop()) ||
(read.getAlignmentStart() < interval.getStart() && read.getAlignmentEnd() > interval.getStop()))
return false;
}
return true;