diff --git a/java/src/net/sf/samtools/SAMFileReader2.java b/java/src/net/sf/samtools/SAMFileReader2.java index e1120e1ef..2f5ae4ece 100644 --- a/java/src/net/sf/samtools/SAMFileReader2.java +++ b/java/src/net/sf/samtools/SAMFileReader2.java @@ -41,7 +41,6 @@ import org.broadinstitute.sting.utils.StingException; public class SAMFileReader2 extends SAMFileReader { /** * Prepare to read a SAM or BAM file. If the given file is a BAM, and has a companion BAI index file - * that is named according to the convention, it will be found and opened, and indexed query will be allowed. */ public SAMFileReader2(final File file) { this(file, null, false); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java index b4dcab9a1..e4313f0ce 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java @@ -66,7 +66,6 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source."); blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource; - final int deepestBinLevel = blockDrivenDataSource.getNumIndexLevels()-1; // Create a list of contig name -> genome loc, sorted in INSERTION ORDER. LinkedHashMap> locationToReference = new LinkedHashMap>(); @@ -76,34 +75,103 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { locationToReference.get(location.getContig()).add(location); } + // TODO: Not sure there's any reason to pre-separate the contigs now that we're using a streaming approach to file pointer allocation. for(String contig: locationToReference.keySet()) { - // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. - SortedMap> bins = new TreeMap>(); - for(GenomeLoc location: locationToReference.get(contig)) { - List binsForLocation = blockDrivenDataSource.getOverlappingBins(location); - for(Bin bin: binsForLocation) { - if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) { - final int firstLoc = blockDrivenDataSource.getFirstLocusInBin(bin); - final int lastLoc = blockDrivenDataSource.getLastLocusInBin(bin); - if(!bins.containsKey(bin)) - bins.put(bin,new ArrayList()); - bins.get(bin).add(GenomeLocParser.createGenomeLoc(location.getContig(), - Math.max(location.getStart(),firstLoc), - Math.min(location.getStop(),lastLoc))); - } - } - } - - // Add a record of the new bin structure. - for(SortedMap.Entry> entry: bins.entrySet()) { - Collections.sort(entry.getValue()); - filePointers.add(new FilePointer(entry.getKey(),entry.getValue())); - } + filePointers.addAll(batchLociIntoBins(locationToReference.get(contig),blockDrivenDataSource.getNumIndexLevels()-1)); } filePointerIterator = filePointers.iterator(); } + private List batchLociIntoBins(final List loci, final int binsDeeperThan) { + // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. + List filePointers = new ArrayList(); + FilePointer filePointer = null; + + for(GenomeLoc location: loci) { + int locationStart = (int)location.getStart(); + final int locationStop = (int)location.getStop(); + + List bins = findBinsAtLeastAsDeepAs(blockDrivenDataSource.getOverlappingBins(location),binsDeeperThan); + + if(bins.size() == 0) { + if(filePointer != null && filePointer.locations.size() > 0) { + filePointers.add(filePointer); + filePointer = null; + } + + filePointers.add(new FilePointer(location)); + continue; + } + Collections.sort(bins); + + Iterator binIterator = bins.iterator(); + + while(locationStop >= locationStart) { + int binStart = filePointer!=null ? blockDrivenDataSource.getFirstLocusInBin(filePointer.bin) : 0; + int binStop = filePointer!=null ? blockDrivenDataSource.getLastLocusInBin(filePointer.bin) : 0; + + while(binStop <= locationStart && binIterator.hasNext()) { + if(filePointer != null && filePointer.locations.size() > 0) + filePointers.add(filePointer); + + filePointer = new FilePointer(binIterator.next()); + binStart = blockDrivenDataSource.getFirstLocusInBin(filePointer.bin); + binStop = blockDrivenDataSource.getLastLocusInBin(filePointer.bin); + } + + if(locationStart < binStart) { + // The region starts before the first bin in the sequence. Add the region occurring before the sequence. + if(filePointer != null && filePointer.locations.size() > 0) { + filePointers.add(filePointer); + filePointer = null; + } + + final int regionStop = Math.min(locationStop,binStart-1); + + GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop); + filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1)); + + locationStart = regionStop + 1; + } + else if(locationStart > binStop) { + // The region starts after the last bin in the sequence. Add the region occurring after the sequence. + if(filePointer != null && filePointer.locations.size() > 0) { + filePointers.add(filePointer); + filePointer = null; + } + + GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,locationStop); + filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1)); + + locationStart = locationStop + 1; + } + else { + // The start of the region overlaps the bin. Add the overlapping subset. + final int regionStop = Math.min(locationStop,binStop); + filePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(), + locationStart, + regionStop)); + locationStart = regionStop + 1; + } + } + } + + if(filePointer != null && filePointer.locations.size() > 0) + filePointers.add(filePointer); + + return filePointers; + } + + private List findBinsAtLeastAsDeepAs(final List bins, final int deepestBinLevel) { + List deepestBins = new ArrayList(); + for(Bin bin: bins) { + if(blockDrivenDataSource.getLevelForBin(bin) >= deepestBinLevel) + deepestBins.add(bin); + } + return deepestBins; + } + /** * returns true if there are additional shards * @@ -145,9 +213,18 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { private final Bin bin; private final List locations; - public FilePointer(Bin bin, List locations) { + public FilePointer(Bin bin) { this.bin = bin; - this.locations = locations; + this.locations = new ArrayList(); + } + + public FilePointer(GenomeLoc location) { + bin = null; + locations = Collections.singletonList(location); + } + + public void addLocation(GenomeLoc location) { + locations.add(location); } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IntervalOverlappingFilter.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IntervalOverlappingFilter.java index b90f05b4d..827f69938 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IntervalOverlappingFilter.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IntervalOverlappingFilter.java @@ -29,9 +29,10 @@ public class IntervalOverlappingFilter implements SamRecordFilter { * @return True to filter the read out. False otherwise. */ public boolean filterOut(SAMRecord read) { - GenomeLoc readLocation = GenomeLocParser.createGenomeLoc(read); for(GenomeLoc interval: intervals) { - if(interval.overlapsP(readLocation)) + if((read.getAlignmentStart() >= interval.getStart() && read.getAlignmentStart() <= interval.getStop()) || + (read.getAlignmentEnd() >= interval.getStart() && read.getAlignmentEnd() <= interval.getStop()) || + (read.getAlignmentStart() < interval.getStart() && read.getAlignmentEnd() > interval.getStop())) return false; } return true;