Misc correctness enhancements: develop the bin selector into a recursive algorithm and return a shard when reads are missing. Also improve the performance of the read filter that clips reads not actually present in the shard.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2870 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
5f9c3f3884
commit
88d0677379
|
|
@ -41,7 +41,6 @@ import org.broadinstitute.sting.utils.StingException;
|
||||||
public class SAMFileReader2 extends SAMFileReader {
|
public class SAMFileReader2 extends SAMFileReader {
|
||||||
/**
|
/**
|
||||||
* Prepare to read a SAM or BAM file. If the given file is a BAM, and has a companion BAI index file
|
* Prepare to read a SAM or BAM file. If the given file is a BAM, and has a companion BAI index file
|
||||||
* that is named according to the convention, it will be found and opened, and indexed query will be allowed.
|
|
||||||
*/
|
*/
|
||||||
public SAMFileReader2(final File file) {
|
public SAMFileReader2(final File file) {
|
||||||
this(file, null, false);
|
this(file, null, false);
|
||||||
|
|
|
||||||
|
|
@ -66,7 +66,6 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source.");
|
throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source.");
|
||||||
|
|
||||||
blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource;
|
blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource;
|
||||||
final int deepestBinLevel = blockDrivenDataSource.getNumIndexLevels()-1;
|
|
||||||
|
|
||||||
// Create a list of contig name -> genome loc, sorted in INSERTION ORDER.
|
// Create a list of contig name -> genome loc, sorted in INSERTION ORDER.
|
||||||
LinkedHashMap<String,List<GenomeLoc>> locationToReference = new LinkedHashMap<String,List<GenomeLoc>>();
|
LinkedHashMap<String,List<GenomeLoc>> locationToReference = new LinkedHashMap<String,List<GenomeLoc>>();
|
||||||
|
|
@ -76,34 +75,103 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
locationToReference.get(location.getContig()).add(location);
|
locationToReference.get(location.getContig()).add(location);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: Not sure there's any reason to pre-separate the contigs now that we're using a streaming approach to file pointer allocation.
|
||||||
for(String contig: locationToReference.keySet()) {
|
for(String contig: locationToReference.keySet()) {
|
||||||
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
filePointers.addAll(batchLociIntoBins(locationToReference.get(contig),blockDrivenDataSource.getNumIndexLevels()-1));
|
||||||
SortedMap<Bin,List<GenomeLoc>> bins = new TreeMap<Bin,List<GenomeLoc>>();
|
|
||||||
for(GenomeLoc location: locationToReference.get(contig)) {
|
|
||||||
List<Bin> binsForLocation = blockDrivenDataSource.getOverlappingBins(location);
|
|
||||||
for(Bin bin: binsForLocation) {
|
|
||||||
if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) {
|
|
||||||
final int firstLoc = blockDrivenDataSource.getFirstLocusInBin(bin);
|
|
||||||
final int lastLoc = blockDrivenDataSource.getLastLocusInBin(bin);
|
|
||||||
if(!bins.containsKey(bin))
|
|
||||||
bins.put(bin,new ArrayList<GenomeLoc>());
|
|
||||||
bins.get(bin).add(GenomeLocParser.createGenomeLoc(location.getContig(),
|
|
||||||
Math.max(location.getStart(),firstLoc),
|
|
||||||
Math.min(location.getStop(),lastLoc)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add a record of the new bin structure.
|
|
||||||
for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet()) {
|
|
||||||
Collections.sort(entry.getValue());
|
|
||||||
filePointers.add(new FilePointer(entry.getKey(),entry.getValue()));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
filePointerIterator = filePointers.iterator();
|
filePointerIterator = filePointers.iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Groups the given loci into {@link FilePointer}s, splitting each locus at bin boundaries.
 *
 * For every locus, the bins overlapping it are fetched from the data source, reduced to those
 * whose level is at least {@code binsDeeperThan}, and sorted. The locus is then consumed from
 * its start to its stop: the part overlapping the current bin is attached to that bin's
 * FilePointer, while any part lying before or after the current bin is handed to a recursive
 * call with {@code binsDeeperThan-1}. A locus with no qualifying bins at all is emitted as a
 * bin-less FilePointer.
 *
 * NOTE(review): the single shared {@code filePointer} cursor suggests the loci are expected in
 * sorted order -- confirm against the caller.
 *
 * @param loci loci to batch into bins.
 * @param binsDeeperThan minimum bin level (as reported by getLevelForBin) to consider.
 * @return the file pointers created, in the order they were completed.
 */
private List<FilePointer> batchLociIntoBins(final List<GenomeLoc> loci, final int binsDeeperThan) {
|
||||||
|
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
||||||
|
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||||
|
// The bin-backed file pointer currently being filled; null when none is open.
FilePointer filePointer = null;
|
||||||
|
|
||||||
|
for(GenomeLoc location: loci) {
|
||||||
|
// Coordinates are narrowed to int here; locationStart advances as pieces of the locus are consumed.
int locationStart = (int)location.getStart();
|
||||||
|
final int locationStop = (int)location.getStop();
|
||||||
|
|
||||||
|
List<Bin> bins = findBinsAtLeastAsDeepAs(blockDrivenDataSource.getOverlappingBins(location),binsDeeperThan);
|
||||||
|
|
||||||
|
// No bin of the requested depth overlaps this locus: flush any open file pointer
// and emit the whole locus as a bin-less file pointer.
if(bins.size() == 0) {
|
||||||
|
if(filePointer != null && filePointer.locations.size() > 0) {
|
||||||
|
filePointers.add(filePointer);
|
||||||
|
filePointer = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
filePointers.add(new FilePointer(location));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Collections.sort(bins);
|
||||||
|
|
||||||
|
Iterator<Bin> binIterator = bins.iterator();
|
||||||
|
|
||||||
|
// Consume the locus piece by piece until locationStart passes locationStop.
while(locationStop >= locationStart) {
|
||||||
|
// With no open file pointer both bounds are 0, which lets the loop below pull the first bin.
int binStart = filePointer!=null ? blockDrivenDataSource.getFirstLocusInBin(filePointer.bin) : 0;
|
||||||
|
int binStop = filePointer!=null ? blockDrivenDataSource.getLastLocusInBin(filePointer.bin) : 0;
|
||||||
|
|
||||||
|
// Advance through the sorted bins, saving any non-empty file pointer before replacing it.
// NOTE(review): this also advances when binStop == locationStart, whereas the branch
// further down only treats locationStart > binStop as "after the bin" -- confirm the
// intended boundary.
while(binStop <= locationStart && binIterator.hasNext()) {
|
||||||
|
if(filePointer != null && filePointer.locations.size() > 0)
|
||||||
|
filePointers.add(filePointer);
|
||||||
|
|
||||||
|
filePointer = new FilePointer(binIterator.next());
|
||||||
|
binStart = blockDrivenDataSource.getFirstLocusInBin(filePointer.bin);
|
||||||
|
binStop = blockDrivenDataSource.getLastLocusInBin(filePointer.bin);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(locationStart < binStart) {
|
||||||
|
// The region starts before the first bin in the sequence. Add the region occurring before the sequence.
|
||||||
|
if(filePointer != null && filePointer.locations.size() > 0) {
|
||||||
|
filePointers.add(filePointer);
|
||||||
|
filePointer = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The uncovered piece ends just before the current bin, or at the end of the locus if that comes first.
final int regionStop = Math.min(locationStop,binStart-1);
|
||||||
|
|
||||||
|
GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop);
|
||||||
|
// Retry the uncovered piece with the level threshold lowered by one.
filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1));
|
||||||
|
|
||||||
|
locationStart = regionStop + 1;
|
||||||
|
}
|
||||||
|
else if(locationStart > binStop) {
|
||||||
|
// The region starts after the last bin in the sequence. Add the region occurring after the sequence.
|
||||||
|
if(filePointer != null && filePointer.locations.size() > 0) {
|
||||||
|
filePointers.add(filePointer);
|
||||||
|
filePointer = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,locationStop);
|
||||||
|
// The entire remainder of the locus is retried with the level threshold lowered by one.
filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1));
|
||||||
|
|
||||||
|
// Moves locationStart past locationStop, ending the enclosing while loop for this locus.
locationStart = locationStop + 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// The start of the region overlaps the bin. Add the overlapping subset.
|
||||||
|
final int regionStop = Math.min(locationStop,binStop);
|
||||||
|
filePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(),
|
||||||
|
locationStart,
|
||||||
|
regionStop));
|
||||||
|
locationStart = regionStop + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Flush the file pointer still open after the last locus, if it holds any locations.
if(filePointer != null && filePointer.locations.size() > 0)
|
||||||
|
filePointers.add(filePointer);
|
||||||
|
|
||||||
|
return filePointers;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Filters a list of bins down to those at or below a given depth in the index.
 *
 * @param bins candidate bins; the list itself is not modified.
 * @param deepestBinLevel minimum level, as reported by the data source's getLevelForBin, that a bin must have to be kept.
 * @return a new list holding, in their original order, the bins whose level is >= deepestBinLevel.
 */
private List<Bin> findBinsAtLeastAsDeepAs(final List<Bin> bins, final int deepestBinLevel) {
|
||||||
|
List<Bin> deepestBins = new ArrayList<Bin>();
|
||||||
|
for(Bin bin: bins) {
|
||||||
|
// Keep the bin only if its level meets the threshold.
if(blockDrivenDataSource.getLevelForBin(bin) >= deepestBinLevel)
|
||||||
|
deepestBins.add(bin);
|
||||||
|
}
|
||||||
|
return deepestBins;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* returns true if there are additional shards
|
* returns true if there are additional shards
|
||||||
*
|
*
|
||||||
|
|
@ -145,9 +213,18 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
||||||
private final Bin bin;
|
private final Bin bin;
|
||||||
private final List<GenomeLoc> locations;
|
private final List<GenomeLoc> locations;
|
||||||
|
|
||||||
public FilePointer(Bin bin, List<GenomeLoc> locations) {
|
public FilePointer(Bin bin) {
|
||||||
this.bin = bin;
|
this.bin = bin;
|
||||||
this.locations = locations;
|
this.locations = new ArrayList<GenomeLoc>();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Creates a file pointer that has no bin and covers exactly one location.
 *
 * @param location the single location this file pointer covers.
 */
public FilePointer(GenomeLoc location) {
|
||||||
|
// No bin is associated with this file pointer.
bin = null;
|
||||||
|
// Collections.singletonList returns an immutable list, so further locations cannot be added to it.
locations = Collections.singletonList(location);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addLocation(GenomeLoc location) {
|
||||||
|
locations.add(location);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,9 +29,10 @@ public class IntervalOverlappingFilter implements SamRecordFilter {
|
||||||
* @return True to filter the read out. False otherwise.
|
* @return True to filter the read out. False otherwise.
|
||||||
*/
|
*/
|
||||||
public boolean filterOut(SAMRecord read) {
|
public boolean filterOut(SAMRecord read) {
|
||||||
GenomeLoc readLocation = GenomeLocParser.createGenomeLoc(read);
|
|
||||||
for(GenomeLoc interval: intervals) {
|
for(GenomeLoc interval: intervals) {
|
||||||
if(interval.overlapsP(readLocation))
|
if((read.getAlignmentStart() >= interval.getStart() && read.getAlignmentStart() <= interval.getStop()) ||
|
||||||
|
(read.getAlignmentEnd() >= interval.getStart() && read.getAlignmentEnd() <= interval.getStop()) ||
|
||||||
|
(read.getAlignmentStart() < interval.getStart() && read.getAlignmentEnd() > interval.getStop()))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue