Misc correctness enhancements: develop the bin selector into a recursive algorithm and return a shard when reads are missing. Also improve the performance of the read filter that clips reads not actually present in the shard.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2870 348d0f76-0448-11de-a6fe-93d51630548a
2010-02-22 22:19:06 +00:00 · 2010-02-22 22:19:06 +00:00 · 88d0677379
parent 5f9c3f3884
commit 88d0677379
3 changed files with 105 additions and 28 deletions
--- a/java/src/net/sf/samtools/SAMFileReader2.java
+++ b/java/src/net/sf/samtools/SAMFileReader2.java
@ -41,7 +41,6 @@ import org.broadinstitute.sting.utils.StingException;
 public class SAMFileReader2 extends SAMFileReader {
    /**
     * Prepare to read a SAM or BAM file.  If the given file is a BAM, and has a companion BAI index file
-     * that is named according to the convention, it will be found and opened, and indexed query will be allowed.
     */
    public SAMFileReader2(final File file) {
        this(file, null, false);
--- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java
@ -66,7 +66,6 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
            throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source.");

        blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource;
-        final int deepestBinLevel = blockDrivenDataSource.getNumIndexLevels()-1;

        // Create a list of contig name -> genome loc, sorted in INSERTION ORDER.
        LinkedHashMap<String,List<GenomeLoc>> locationToReference = new LinkedHashMap<String,List<GenomeLoc>>();
@ -76,34 +75,103 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
            locationToReference.get(location.getContig()).add(location);
        }

+        // TODO: Not sure there's any reason to pre-separate the contigs now that we're using a streaming approach to file pointer allocation.
        for(String contig: locationToReference.keySet()) {
-            // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
-            SortedMap<Bin,List<GenomeLoc>> bins = new TreeMap<Bin,List<GenomeLoc>>();
-            for(GenomeLoc location: locationToReference.get(contig)) {
-                List<Bin> binsForLocation = blockDrivenDataSource.getOverlappingBins(location);
-                for(Bin bin: binsForLocation) {
-                    if(blockDrivenDataSource.getLevelForBin(bin) == deepestBinLevel) {
-                        final int firstLoc = blockDrivenDataSource.getFirstLocusInBin(bin);
-                        final int lastLoc = blockDrivenDataSource.getLastLocusInBin(bin);
-                        if(!bins.containsKey(bin))
-                            bins.put(bin,new ArrayList<GenomeLoc>());
-                        bins.get(bin).add(GenomeLocParser.createGenomeLoc(location.getContig(),
-                                                                          Math.max(location.getStart(),firstLoc),
-                                                                          Math.min(location.getStop(),lastLoc)));
-                    }
-                }
-            }
-
-            // Add a record of the new bin structure.
-            for(SortedMap.Entry<Bin,List<GenomeLoc>> entry: bins.entrySet()) {
-                Collections.sort(entry.getValue());                
-                filePointers.add(new FilePointer(entry.getKey(),entry.getValue()));
-            }
+            filePointers.addAll(batchLociIntoBins(locationToReference.get(contig),blockDrivenDataSource.getNumIndexLevels()-1));
        }

        filePointerIterator = filePointers.iterator();
    }

+    private List<FilePointer> batchLociIntoBins(final List<GenomeLoc> loci, final int binsDeeperThan) {
+        // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
+        List<FilePointer> filePointers = new ArrayList<FilePointer>();
+        FilePointer filePointer = null;
+
+        for(GenomeLoc location: loci) {
+            int locationStart = (int)location.getStart();
+            final int locationStop = (int)location.getStop();
+
+            List<Bin> bins = findBinsAtLeastAsDeepAs(blockDrivenDataSource.getOverlappingBins(location),binsDeeperThan);
+
+            if(bins.size() == 0) {
+                if(filePointer != null && filePointer.locations.size() > 0) {
+                    filePointers.add(filePointer);
+                    filePointer = null;
+                }                
+
+                filePointers.add(new FilePointer(location));
+                continue;
+            }
+            Collections.sort(bins);
+
+            Iterator<Bin> binIterator = bins.iterator();
+
+            while(locationStop >= locationStart) {
+                int binStart = filePointer!=null ? blockDrivenDataSource.getFirstLocusInBin(filePointer.bin) : 0;
+                int binStop = filePointer!=null ? blockDrivenDataSource.getLastLocusInBin(filePointer.bin) : 0;
+
+                while(binStop <= locationStart && binIterator.hasNext()) {
+                    if(filePointer != null && filePointer.locations.size() > 0)
+                        filePointers.add(filePointer);
+
+                    filePointer = new FilePointer(binIterator.next());
+                    binStart = blockDrivenDataSource.getFirstLocusInBin(filePointer.bin);
+                    binStop = blockDrivenDataSource.getLastLocusInBin(filePointer.bin);
+                }
+
+                if(locationStart < binStart) {
+                    // The region starts before the first bin in the sequence.  Add the region occurring before the sequence.
+                    if(filePointer != null && filePointer.locations.size() > 0) {
+                        filePointers.add(filePointer);
+                        filePointer = null;
+                    }
+
+                    final int regionStop = Math.min(locationStop,binStart-1);
+
+                    GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop);
+                    filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1));
+
+                    locationStart = regionStop + 1;
+                }
+                else if(locationStart > binStop) {
+                    // The region starts after the last bin in the sequence.  Add the region occurring after the sequence.
+                    if(filePointer != null && filePointer.locations.size() > 0) {
+                        filePointers.add(filePointer);
+                        filePointer = null;
+                    }
+
+                    GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,locationStop);
+                    filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1));
+
+                    locationStart = locationStop + 1;
+                }
+                else {
+                    // The start of the region overlaps the bin.  Add the overlapping subset.
+                    final int regionStop = Math.min(locationStop,binStop);
+                    filePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(),
+                            locationStart,
+                            regionStop));
+                    locationStart = regionStop + 1;
+                }
+            }
+        }
+
+        if(filePointer != null && filePointer.locations.size() > 0)
+            filePointers.add(filePointer);
+
+        return filePointers;
+    }
+
+    private List<Bin> findBinsAtLeastAsDeepAs(final List<Bin> bins, final int deepestBinLevel) {
+        List<Bin> deepestBins = new ArrayList<Bin>();
+        for(Bin bin: bins) {
+            if(blockDrivenDataSource.getLevelForBin(bin) >= deepestBinLevel)
+                deepestBins.add(bin);
+        }
+        return deepestBins;
+    }
+
    /**
     * returns true if there are additional shards
     *
@ -145,9 +213,18 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
        private final Bin bin;
        private final List<GenomeLoc> locations;

-        public FilePointer(Bin bin, List<GenomeLoc> locations) {
+        public FilePointer(Bin bin) {
            this.bin = bin;
-            this.locations = locations;
+            this.locations = new ArrayList<GenomeLoc>();
+        }
+
+        public FilePointer(GenomeLoc location) {
+            bin = null;
+            locations = Collections.singletonList(location);
+        }
+
+        public void addLocation(GenomeLoc location) {
+            locations.add(location);
        }
        
    }
--- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IntervalOverlappingFilter.java
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IntervalOverlappingFilter.java
@ -29,9 +29,10 @@ public class IntervalOverlappingFilter implements SamRecordFilter {
     * @return True to filter the read out.  False otherwise.
     */
    public boolean filterOut(SAMRecord read) {
-        GenomeLoc readLocation = GenomeLocParser.createGenomeLoc(read);
        for(GenomeLoc interval: intervals) {
-            if(interval.overlapsP(readLocation))
+            if((read.getAlignmentStart() >= interval.getStart() && read.getAlignmentStart() <= interval.getStop()) ||
+               (read.getAlignmentEnd() >= interval.getStart() && read.getAlignmentEnd() <= interval.getStop()) ||
+               (read.getAlignmentStart() < interval.getStart() && read.getAlignmentEnd() > interval.getStop()))
                return false;
        }
        return true;