diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java index 032c93b54..12feb4c74 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java @@ -27,7 +27,12 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { /** * The data source used to shard. */ - protected final BlockDrivenSAMDataSource dataSource; + private final BlockDrivenSAMDataSource dataSource; + + /** + * The intervals to be processed. + */ + private final GenomeLocSortedSet locations; /** * The cached shard to be returned next. Prefetched in the peekable iterator style. @@ -63,10 +68,13 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { this.dataSource = (BlockDrivenSAMDataSource)dataSource; this.position = this.dataSource.getCurrentPosition(); - if(locations != null) - filePointers.addAll(IntervalSharder.shardIntervals(this.dataSource,locations.toList())); + this.locations = locations; + + if(locations != null) + filePointerIterator = IntervalSharder.shardIntervals(this.dataSource,locations.toList()); + else + filePointerIterator = filePointers.iterator(); - filePointerIterator = filePointers.iterator(); if(filePointerIterator.hasNext()) currentFilePointer = filePointerIterator.next(); @@ -99,7 +107,7 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { nextShard = null; SamRecordFilter filter = null; - if(!filePointers.isEmpty()) { + if(locations != null) { Map> selectedReaders = new HashMap>(); while(selectedReaders.size() == 0 && currentFilePointer != null) { shardPosition = currentFilePointer.chunks; diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java 
b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java index d428064d3..20c44b994 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java @@ -12,7 +12,6 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import java.util.*; import net.sf.samtools.Chunk; -import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMSequenceRecord; @@ -51,9 +50,6 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { */ private final BlockDrivenSAMDataSource reads; - /** our storage of the genomic locations they'd like to shard over */ - private final List filePointers = new ArrayList(); - /** * An iterator through the available file pointers. */ @@ -90,13 +86,13 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { else intervals = locations.toList(); - this.reads = (BlockDrivenSAMDataSource)reads; - filePointers.addAll(IntervalSharder.shardIntervals(this.reads,intervals)); + this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals); } else { final int maxShardSize = 100000; this.reads = null; + List filePointers = new ArrayList(); if(locations == null) { for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { @@ -109,9 +105,9 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { for(GenomeLoc interval: locations) filePointers.add(new FilePointer(interval)); } + filePointerIterator = filePointers.iterator(); } - filePointerIterator = filePointers.iterator(); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java 
b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java index 91520f842..311c8a94f 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java @@ -1,9 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.shards; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; @@ -19,192 +16,113 @@ import net.sf.picard.util.PeekableIterator; * @version 0.1 */ public class IntervalSharder { - protected static List shardIntervals(final BlockDrivenSAMDataSource dataSource, final List loci) { - Map> filePointersByReader = new HashMap>(); - for(SAMReaderID id: dataSource.getReaderIDs()) { - PreloadedBAMFileIndex index = dataSource.getIndex(id); - // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.\ - filePointersByReader.put(id,shardIntervalsOverIndex(dataSource,id,index,loci,index.getNumIndexLevels()-1)); - index.close(); - } - return combineFilePointers(filePointersByReader); + public static Iterator shardIntervals(final BlockDrivenSAMDataSource dataSource, final List loci) { + return new FilePointerIterator(dataSource,loci); } /** - * Combine adjacent file pointers into a structure that can be streamed in. - * @param filePointersByReader File pointers broken down by reader. - * @return A large structure of file pointers. + * A lazy-loading iterator over file pointers. 
*/ - private static List combineFilePointers(Map> filePointersByReader) { - PeekableIterator mergingIterator = new PeekableIterator(new FilePointerMergingIterator(filePointersByReader)); + private static class FilePointerIterator implements Iterator { + final BlockDrivenSAMDataSource dataSource; + final PeekableIterator locusIterator; + final Queue cachedFilePointers = new LinkedList(); - List overlappingFilePointers = new ArrayList(); - List mergedFilePointers = new ArrayList(); - - while(mergingIterator.hasNext()) { - GenomeLoc bounds = null; - - // Load up a segment where file pointers overlap - while(mergingIterator.hasNext() && (overlappingFilePointers.size() == 0 || mergingIterator.peek().getBounds().overlapsP(bounds))) { - FilePointer filePointer = mergingIterator.next(); - if(bounds != null) - bounds = GenomeLocParser.createGenomeLoc(bounds.getContig(), - Math.min(bounds.getStart(),filePointer.getBounds().getStart()), - Math.max(bounds.getStop(),filePointer.getBounds().getStop())); - else - bounds = filePointer.getBounds(); - overlappingFilePointers.add(filePointer); - } - - // determine the complete set of unique locations defining this set. 
- List overlappingLocations = new ArrayList(); - for(FilePointer filePointer: overlappingFilePointers) - overlappingLocations.addAll(filePointer.locations); - Collections.sort(overlappingLocations); - overlappingLocations = GenomeLocSortedSet.mergeOverlappingLocations(overlappingLocations); - - while(!overlappingLocations.isEmpty()) { - long overlapStart = overlappingLocations.get(0).getStart(); - long overlapStop = overlappingLocations.get(overlappingLocations.size()-1).getStop(); - - for(FilePointer overlappingFilePointer: overlappingFilePointers) { - if(overlappingFilePointer.getBounds().getStop() < overlapStart) - continue; - if(overlappingFilePointer.getBounds().getStart() > overlapStart) overlapStop = Math.min(overlapStop,overlappingFilePointer.getBounds().getStart()-1); - if(overlappingFilePointer.getBounds().getStop() < overlapStop) overlapStop = Math.min(overlapStop,overlappingFilePointer.getBounds().getStop()); - } - - // Find the overlapping genome locs. - List segmentOverlap = new ArrayList(); - for(GenomeLoc overlappingLocation: overlappingLocations) { - if(overlappingLocation.getStop() <= overlapStop) { - // segment is completely before end of overlap. - segmentOverlap.add(overlappingLocation); - } - else if(overlappingLocation.getStart() <= overlapStop) { - // segment is partially before end of overlap. - segmentOverlap.add(GenomeLocParser.setStop(overlappingLocation,overlapStop)); - break; - } - else { - // segment starts after overlap ends. - break; - } - } - - // Trim the overlapping genome locs of the overlapping locations list. - while(!overlappingLocations.isEmpty() && overlappingLocations.get(0).getStart() <= overlapStop) { - GenomeLoc location = overlappingLocations.remove(0); - if(location.getStop() > overlapStop) - overlappingLocations.add(0,GenomeLocParser.setStart(location,overlapStop+1)); - } - - // Merge together all file pointers that overlap with these bounds. 
- GenomeLoc overlapBounds = GenomeLocParser.createGenomeLoc(segmentOverlap.get(0).getContigIndex(),overlapStart,overlapStop); - FilePointer mergedFilePointer = null; - for(FilePointer overlappingFilePointer: overlappingFilePointers) { - if(overlappingFilePointer.getBounds().overlapsP(overlapBounds)) - mergedFilePointer = overlappingFilePointer.merge(mergedFilePointer,segmentOverlap); - } - - // Add the resulting file pointer and clear state. - mergedFilePointers.add(mergedFilePointer); - } - - // reset - overlappingFilePointers.clear(); - } - - return mergedFilePointers; - } - - private static class FilePointerMergingIterator implements Iterator { - private PriorityQueue> filePointerQueue; - - public FilePointerMergingIterator(Map> filePointers) { - filePointerQueue = new PriorityQueue>(filePointers.size(),new FilePointerMergingComparator()); - for(List filePointersByReader: filePointers.values()) - filePointerQueue.add(new PeekableIterator(filePointersByReader.iterator())); + public FilePointerIterator(final BlockDrivenSAMDataSource dataSource, final List loci) { + this.dataSource = dataSource; + locusIterator = new PeekableIterator(loci.iterator()); + advance(); } public boolean hasNext() { - return !filePointerQueue.isEmpty(); + return !cachedFilePointers.isEmpty(); } public FilePointer next() { - if(!hasNext()) throw new NoSuchElementException("FilePointerMergingIterator is out of elements"); - PeekableIterator nextIterator = filePointerQueue.remove(); - FilePointer nextFilePointer = nextIterator.next(); - if(nextIterator.hasNext()) - filePointerQueue.add(nextIterator); - return nextFilePointer; + if(!hasNext()) + throw new NoSuchElementException("FilePointerIterator iteration is complete"); + FilePointer filePointer = cachedFilePointers.remove(); + if(cachedFilePointers.isEmpty()) + advance(); + return filePointer; } - public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); } + public void remove() { + throw 
new UnsupportedOperationException("Cannot remove from a FilePointerIterator"); + } - private class FilePointerMergingComparator implements Comparator> { - public int compare(PeekableIterator lhs, PeekableIterator rhs) { - if(!lhs.hasNext() && !rhs.hasNext()) return 0; - if(!rhs.hasNext()) return -1; - if(!lhs.hasNext()) return 1; - return lhs.peek().getBounds().compareTo(rhs.peek().getBounds()); + private void advance() { + List nextBatch = new ArrayList(); + String contig = null; + + while(locusIterator.hasNext() && nextBatch.isEmpty()) { + contig = null; + while(locusIterator.hasNext() && (contig == null || locusIterator.peek().getContig().equals(contig))) { + GenomeLoc nextLocus = locusIterator.next(); + contig = nextLocus.getContig(); + nextBatch.add(nextLocus); + } } + + if(nextBatch.size() > 0) + cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch)); } } - - private static List shardIntervalsOverIndex(final BlockDrivenSAMDataSource dataSource, final SAMReaderID id, final PreloadedBAMFileIndex index, final List loci, final int binsDeeperThan) { + + private static List shardIntervalsOnContig(final BlockDrivenSAMDataSource dataSource, final String contig, final List loci) { // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. 
List filePointers = new ArrayList(); FilePointer lastFilePointer = null; - Bin lastBin = null; + BAMOverlap lastBAMOverlap = null; + + Map readerToIndexMap = new HashMap(); + BinMergingIterator binMerger = new BinMergingIterator(); + for(SAMReaderID id: dataSource.getReaderIDs()) { + final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig); + final PreloadedBAMFileIndex index = dataSource.getIndex(id); + binMerger.addReader(id, + index, + referenceSequence.getSequenceIndex(), + index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator()); + // Cache the reader for later data lookup. + readerToIndexMap.put(id,index); + } + PeekableIterator binIterator = new PeekableIterator(binMerger); for(GenomeLoc location: loci) { - // If crossing contigs, be sure to reset the filepointer that's been accumulating shard data. - if(lastFilePointer != null && lastFilePointer.referenceSequence != location.getContigIndex()) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBin = null; - } + if(!location.getContig().equals(contig)) + throw new StingException("Location outside bounds of contig"); int locationStart = (int)location.getStart(); final int locationStop = (int)location.getStop(); - List bins = findBinsAtLeastAsDeepAs(index,getOverlappingBins(dataSource,id,index,location),binsDeeperThan); + // Advance to first bin. + while(binIterator.peek().stop < locationStart) + binIterator.next(); - // Recursive stopping condition -- algorithm is at the zero point and no bins have been found. - if(binsDeeperThan == 0 && bins.size() == 0) { - filePointers.add(new FilePointer(location)); - continue; - } - - // No bins found; step up a level and search again. 
- if(bins.size() == 0) { - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBin = null; - } - - filePointers.addAll(shardIntervalsOverIndex(dataSource,id,index,Collections.singletonList(location),binsDeeperThan-1)); - continue; - } + // Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make + // sure the extending bin is not pruned from the list. + List bamOverlaps = new ArrayList(); + while(binIterator.hasNext() && binIterator.peek().stop <= locationStop) + bamOverlaps.add(binIterator.next()); + if(binIterator.hasNext() && binIterator.peek().start <= locationStop) + bamOverlaps.add(binIterator.peek()); // Bins found; try to match bins with locations. - Collections.sort(bins); - Iterator binIterator = bins.iterator(); + Iterator bamOverlapIterator = bamOverlaps.iterator(); while(locationStop >= locationStart) { - int binStart = lastFilePointer!=null ? index.getFirstLocusInBin(lastBin) : 0; - int binStop = lastFilePointer!=null ? index.getLastLocusInBin(lastBin) : 0; + int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0; + int binStop = lastFilePointer!=null ? 
lastFilePointer.overlap.stop : 0; - while(binStop < locationStart && binIterator.hasNext()) { + while(binStop < locationStart && bamOverlapIterator.hasNext()) { if(lastFilePointer != null && lastFilePointer.locations.size() > 0) filePointers.add(lastFilePointer); - lastBin = binIterator.next(); - lastFilePointer = new FilePointer(id,lastBin.referenceSequence,getFilePointersBounding(index,lastBin)); - binStart = index.getFirstLocusInBin(lastBin); - binStop = index.getLastLocusInBin(lastBin); + lastBAMOverlap = bamOverlapIterator.next(); + lastFilePointer = new FilePointer(contig,lastBAMOverlap); + binStart = lastFilePointer.overlap.start; + binStop = lastFilePointer.overlap.stop; } if(locationStart < binStart) { @@ -212,13 +130,13 @@ public class IntervalSharder { if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { filePointers.add(lastFilePointer); lastFilePointer = null; - lastBin = null; + lastBAMOverlap = null; } final int regionStop = Math.min(locationStop,binStart-1); GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop); - filePointers.addAll(shardIntervalsOverIndex(dataSource,id,index,Collections.singletonList(subset),binsDeeperThan-1)); + lastFilePointer = new FilePointer(subset); locationStart = regionStop + 1; } @@ -227,20 +145,21 @@ public class IntervalSharder { if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { filePointers.add(lastFilePointer); lastFilePointer = null; - lastBin = null; + lastBAMOverlap = null; } GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,locationStop); - filePointers.addAll(shardIntervalsOverIndex(dataSource,id,index,Collections.singletonList(subset),binsDeeperThan-1)); + filePointers.add(new FilePointer(subset)); locationStart = locationStop + 1; } else { + if(lastFilePointer == null) + throw new StingException("Illegal state: initializer failed to create cached file pointer."); + // The start of the region 
overlaps the bin. Add the overlapping subset. final int regionStop = Math.min(locationStop,binStop); - lastFilePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(), - locationStart, - regionStop)); + lastFilePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop)); locationStart = regionStop + 1; } } @@ -249,48 +168,204 @@ public class IntervalSharder { if(lastFilePointer != null && lastFilePointer.locations.size() > 0) filePointers.add(lastFilePointer); + // Lookup the locations for every file pointer in the index. + for(SAMReaderID id: dataSource.getReaderIDs()) { + PreloadedBAMFileIndex index = readerToIndexMap.get(id); + for(FilePointer filePointer: filePointers) + filePointer.addChunks(id,index.getChunksOverlapping(filePointer.overlap.getBin(id))); + index.close(); + } + return filePointers; } - private static List findBinsAtLeastAsDeepAs(final PreloadedBAMFileIndex index, final List bins, final int deepestBinLevel) { - List deepestBins = new ArrayList(); - for(Bin bin: bins) { - if(index.getLevelForBin(bin) >= deepestBinLevel) - deepestBins.add(bin); + private static class BinMergingIterator implements Iterator { + private PriorityQueue binQueue = new PriorityQueue(); + private Queue pendingOverlaps = new LinkedList(); + + public void addReader(final SAMReaderID id, final PreloadedBAMFileIndex index, final int referenceSequence, Iterator bins) { + binQueue.add(new BinQueueState(id,index,referenceSequence,new LowestLevelBinFilteringIterator(index,bins))); + } + + public boolean hasNext() { + return pendingOverlaps.size() > 0 || !binQueue.isEmpty(); + } + + public BAMOverlap next() { + if(!hasNext()) + throw new NoSuchElementException("No elements left in merging iterator"); + if(pendingOverlaps.isEmpty()) + advance(); + return pendingOverlaps.remove(); + } + + public void advance() { + List bins = new ArrayList(); + int boundsStart, boundsStop; + + // Prime the pump + if(binQueue.isEmpty()) + 
return; + bins.add(getNextBin()); + boundsStart = bins.get(0).getStart(); + boundsStop = bins.get(0).getStop(); + + // Accumulate all the bins that overlap the current bin, in sorted order. + while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) { + ReaderBin bin = getNextBin(); + bins.add(bin); + boundsStart = Math.min(boundsStart,bin.getStart()); + boundsStop = Math.max(boundsStop,bin.getStop()); + } + + List> range = new ArrayList>(); + int start = bins.get(0).getStart(); + int stop = bins.get(0).getStop(); + while(start <= boundsStop) { + // Find the next stopping point. + for(ReaderBin bin: bins) { + stop = Math.min(stop,bin.getStop()); + if(start < bin.getStart()) + stop = Math.min(stop,bin.getStart()-1); + } + + range.add(new Pair(start,stop)); + // If the last entry added included the last element, stop. + if(stop >= boundsStop) + break; + + // Find the next start. + start = stop + 1; + for(ReaderBin bin: bins) { + if(start >= bin.getStart() && start <= bin.getStop()) + break; + else if(start < bin.getStart()) { + start = bin.getStart(); + break; + } + } + } + + // Add the next series of BAM overlaps to the window. 
+ for(Pair window: range) { + BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second); + for(ReaderBin bin: bins) + bamOverlap.addBin(bin.id,bin.bin); + pendingOverlaps.add(bamOverlap); + } + } + + public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); } + + private ReaderBin peekNextBin() { + if(binQueue.isEmpty()) + throw new NoSuchElementException("No more bins are available"); + BinQueueState current = binQueue.peek(); + return new ReaderBin(current.id,current.index,current.referenceSequence,current.bins.peek()); + } + + private ReaderBin getNextBin() { + if(binQueue.isEmpty()) + throw new NoSuchElementException("No more bins are available"); + BinQueueState current = binQueue.remove(); + ReaderBin readerBin = new ReaderBin(current.id,current.index,current.referenceSequence,current.bins.next()); + if(current.bins.hasNext()) + binQueue.add(current); + return readerBin; + } + + private class ReaderBin { + public final SAMReaderID id; + public final PreloadedBAMFileIndex index; + public final int referenceSequence; + public final Bin bin; + + public ReaderBin(final SAMReaderID id, final PreloadedBAMFileIndex index, final int referenceSequence, final Bin bin) { + this.id = id; + this.index = index; + this.referenceSequence = referenceSequence; + this.bin = bin; + } + + public int getStart() { + return index.getFirstLocusInBin(bin); + } + + public int getStop() { + return index.getLastLocusInBin(bin); + } + } + + private class BinQueueState implements Comparable { + public final SAMReaderID id; + public final PreloadedBAMFileIndex index; + public final int referenceSequence; + public final PeekableIterator bins; + + public BinQueueState(final SAMReaderID id, final PreloadedBAMFileIndex index, final int referenceSequence, final Iterator bins) { + this.id = id; + this.index = index; + this.referenceSequence = referenceSequence; + this.bins = new PeekableIterator(bins); + } + + public int 
compareTo(BinQueueState other) { + if(!this.bins.hasNext() && !other.bins.hasNext()) return 0; + if(!this.bins.hasNext()) return -1; + if(!other.bins.hasNext()) return 1; + + int thisStart = this.index.getFirstLocusInBin(this.bins.peek()); + int otherStart = other.index.getFirstLocusInBin(other.bins.peek()); + + // Straight integer subtraction works here because lhsStart, rhsStart always positive. + if(thisStart != otherStart) + return thisStart - otherStart; + + int thisStop = this.index.getLastLocusInBin(this.bins.peek()); + int otherStop = other.index.getLastLocusInBin(other.bins.peek()); + + // Straight integer subtraction works here because lhsStop, rhsStop always positive. + return thisStop - otherStop; + } } - return deepestBins; } /** - * Gets a list of the bins in each BAM file that overlap with the given interval list. - * @param location Location for which to determine the bin. - * @return A map of reader back to bin. + * Filters out bins not at the lowest level in the tree. */ - private static List getOverlappingBins(final BlockDrivenSAMDataSource dataSource, final SAMReaderID id, final PreloadedBAMFileIndex index, final GenomeLoc location) { - // All readers will have the same bin structure, so just use the first bin as an example. 
- final SAMFileHeader fileHeader = dataSource.getHeader(id); - int referenceIndex = fileHeader.getSequenceIndex(location.getContig()); - if (referenceIndex != -1) { - return index.getBinsContaining(referenceIndex,(int)location.getStart(),(int)location.getStop()); + private static class LowestLevelBinFilteringIterator implements Iterator { + private PreloadedBAMFileIndex index; + private Iterator wrappedIterator; + + private Bin nextBin; + + public LowestLevelBinFilteringIterator(final PreloadedBAMFileIndex index, Iterator iterator) { + this.index = index; + this.wrappedIterator = iterator; + advance(); } - return Collections.emptyList(); - } - /** - * Gets the file pointers bounded by this bin, grouped by the reader of origination. - * @param bin The bin for which to load data. - * @return A map of the file pointers bounding the bin. - */ - private static List getFilePointersBounding(final PreloadedBAMFileIndex index, final Bin bin) { - if(bin != null) { - List chunks = index.getSearchBins(bin); - return chunks != null ? 
chunks : Collections.emptyList(); + public boolean hasNext() { + return nextBin != null; } - else - return Collections.emptyList(); - } + public Bin next() { + Bin bin = nextBin; + advance(); + return bin; + } + public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); } + + private void advance() { + nextBin = null; + while(wrappedIterator.hasNext() && nextBin == null) { + Bin bin = wrappedIterator.next(); + if(index.getLevelForBin(bin) == index.getNumIndexLevels()-1) + nextBin = bin; + } + } + } } /** @@ -298,47 +373,53 @@ public class IntervalSharder { */ class FilePointer { protected final Map> chunks = new HashMap>(); - protected final int referenceSequence; + protected final String referenceSequence; + protected final BAMOverlap overlap; protected final List locations; - public FilePointer(SAMReaderID id, int referenceSequence, List chunks) { - this.referenceSequence = referenceSequence; - this.chunks.put(id,chunks); - this.locations = new ArrayList(); - } - - public FilePointer(GenomeLoc location) { - referenceSequence = location.getContigIndex(); + public FilePointer(final GenomeLoc location) { + referenceSequence = location.getContig(); + overlap = null; locations = Collections.singletonList(location); } - /** - * Private constructor for merge operation. - * @param referenceSequence Sequence to merge. - * @param locations Merged locations. 
- */ - private FilePointer(final int referenceSequence, final List locations) { + public FilePointer(final String referenceSequence,final BAMOverlap overlap) { this.referenceSequence = referenceSequence; - this.locations = locations; - } - - public FilePointer merge(FilePointer other, List locations) { - FilePointer merged = new FilePointer(referenceSequence,locations); - merged.chunks.putAll(this.chunks); - if(other != null) - merged.chunks.putAll(other.chunks); - return merged; + this.overlap = overlap; + this.locations = new ArrayList(); } public void addLocation(GenomeLoc location) { locations.add(location); } - public GenomeLoc getBounds() { - final long boundaryStart = locations.get(0).getStart(); - final long boundaryStop = locations.get(locations.size()-1).getStop(); - return GenomeLocParser.createGenomeLoc(locations.get(0).getContigIndex(),boundaryStart,boundaryStop); + public void addChunks(SAMReaderID id, List chunks) { + this.chunks.put(id,chunks); + } +} + +/** + * Models a bin at which all BAM files in the merged input stream overlap. 
+ */ +class BAMOverlap { + public final int start; + public final int stop; + + private final Map bins = new HashMap(); + + public BAMOverlap(final int start, final int stop) { + this.start = start; + this.stop = stop; + } + + public void addBin(final SAMReaderID id, final Bin bin) { + bins.put(id,bin); + } + + public Bin getBin(final SAMReaderID id) { + return bins.get(id); } } + diff --git a/settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-3.jar b/settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-4.jar similarity index 94% rename from settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-3.jar rename to settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-4.jar index 05ba590d3..f8fc13d6d 100644 Binary files a/settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-3.jar and b/settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-4.jar differ diff --git a/settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-3.xml b/settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-4.xml similarity index 55% rename from settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-3.xml rename to settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-4.xml index 1b36b12ab..acccf2e28 100644 --- a/settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-3.xml +++ b/settings/repository/edu.mit.broad/picard-private-parts-1333-sharding-4.xml @@ -1,3 +1,3 @@ - + diff --git a/settings/repository/net.sf/picard-1.16.363-sharding.jar b/settings/repository/net.sf/picard-1.16.364-sharding.jar similarity index 95% rename from settings/repository/net.sf/picard-1.16.363-sharding.jar rename to settings/repository/net.sf/picard-1.16.364-sharding.jar index 60f977b53..fc60bbb51 100644 Binary files a/settings/repository/net.sf/picard-1.16.363-sharding.jar and b/settings/repository/net.sf/picard-1.16.364-sharding.jar differ diff --git 
a/settings/repository/net.sf/picard-1.16.363-sharding.xml b/settings/repository/net.sf/picard-1.16.364-sharding.xml similarity index 76% rename from settings/repository/net.sf/picard-1.16.363-sharding.xml rename to settings/repository/net.sf/picard-1.16.364-sharding.xml index 31c6ab92b..b731a711d 100644 --- a/settings/repository/net.sf/picard-1.16.363-sharding.xml +++ b/settings/repository/net.sf/picard-1.16.364-sharding.xml @@ -1,3 +1,3 @@ - + diff --git a/settings/repository/net.sf/sam-1.16.363-sharding.jar b/settings/repository/net.sf/sam-1.16.364-sharding.jar similarity index 94% rename from settings/repository/net.sf/sam-1.16.363-sharding.jar rename to settings/repository/net.sf/sam-1.16.364-sharding.jar index 3ebadb087..7453d10ea 100644 Binary files a/settings/repository/net.sf/sam-1.16.363-sharding.jar and b/settings/repository/net.sf/sam-1.16.364-sharding.jar differ diff --git a/settings/repository/net.sf/sam-1.16.363-sharding.xml b/settings/repository/net.sf/sam-1.16.364-sharding.xml similarity index 52% rename from settings/repository/net.sf/sam-1.16.363-sharding.xml rename to settings/repository/net.sf/sam-1.16.364-sharding.xml index cc08fdd9e..598c75488 100644 --- a/settings/repository/net.sf/sam-1.16.363-sharding.xml +++ b/settings/repository/net.sf/sam-1.16.364-sharding.xml @@ -1,3 +1,3 @@ - +