From fece2167b31f1666e52c027ba5708533838e8865 Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 8 Apr 2011 02:09:14 +0000 Subject: [PATCH] Prototype implementation of protoshard merging when protoshard n and protoshard n+1 completely overlap. Gives a small but consistent performance increase in non-intervaled whole exome traversals (2.79min original, 2.69min revised). Needs a more in depth analysis of optimal shard sizing to determine a true optimum. Also renamed a variable because Khalid disapproved of my naming choices. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5595 348d0f76-0448-11de-a6fe-93d51630548a --- java/src/net/sf/samtools/GATKBAMFileSpan.java | 4 +- .../gatk/datasources/reads/BAMScheduler.java | 220 ++++++++++++++++++ .../gatk/datasources/reads/FilePointer.java | 108 ++++++++- .../datasources/reads/IntervalSharder.java | 2 +- .../reads/LowMemoryIntervalSharder.java | 198 ++-------------- .../SAMFileWriterArgumentTypeDescriptor.java | 10 +- 6 files changed, 349 insertions(+), 193 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java diff --git a/java/src/net/sf/samtools/GATKBAMFileSpan.java b/java/src/net/sf/samtools/GATKBAMFileSpan.java index 16d4ed0f5..f19c3c56d 100644 --- a/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -190,7 +190,7 @@ public class GATKBAMFileSpan extends BAMFileSpan { * @return This file span minuse the other file span. */ - public GATKBAMFileSpan subtract(final GATKBAMFileSpan other) { + public GATKBAMFileSpan minus(final GATKBAMFileSpan other) { Iterator thisIterator = getGATKChunks().iterator(); Iterator otherIterator = other.getGATKChunks().iterator(); @@ -204,7 +204,7 @@ public class GATKBAMFileSpan extends BAMFileSpan { while(thisChunk != null && otherChunk != null) { // If this iterator is before the other, add this to the subtracted list and forge ahead. - if(thisChunk.getChunkEnd() < otherChunk.getChunkStart()) { + if(thisChunk.getChunkEnd() <= otherChunk.getChunkStart()) { subtracted.add(thisChunk); thisChunk = thisIterator.hasNext() ? thisIterator.next() : null; continue; diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java new file mode 100644 index 000000000..c94609d54 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.GATKBAMFileSpan; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once. + */ +public class BAMScheduler implements Iterator { + private final SAMDataSource dataSource; + + private final Map indices = new HashMap(); + + private FilePointer nextFilePointer = null; + + private final GenomeLocSortedSet loci; + + private final PeekableIterator locusIterator; + + private GenomeLoc currentLocus; + + public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + this.dataSource = dataSource; + for(SAMReaderID reader: dataSource.getReaderIDs()) + indices.put(reader,(GATKBAMIndex)dataSource.getIndex(reader)); + this.loci = loci; + locusIterator = new PeekableIterator(loci.iterator()); + if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + advance(); + } + + public boolean hasNext() { + return nextFilePointer != null; + } + + public FilePointer next() { + if(!hasNext()) + throw new NoSuchElementException("No next element available in interval sharder"); + FilePointer currentFilePointer = nextFilePointer; + advance(); + return currentFilePointer; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder"); + } + + private void advance() { + if(loci.isEmpty()) + return; + + nextFilePointer = null; + while(nextFilePointer == null && currentLocus != null) { + // special case handling of the unmapped shard. + if(currentLocus == GenomeLoc.UNMAPPED) { + nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED); + for(SAMReaderID id: dataSource.getReaderIDs()) + nextFilePointer.addFileSpans(id,null); + currentLocus = null; + continue; + } + + nextFilePointer = new FilePointer(); + + int coveredRegionStart = 1; + int coveredRegionStop = Integer.MAX_VALUE; + GenomeLoc coveredRegion = null; + + BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indices,currentLocus); + + // No overlapping data at all. + if(scheduleEntry != null) { + coveredRegionStart = Math.max(coveredRegionStart,scheduleEntry.start); + coveredRegionStop = Math.min(coveredRegionStop,scheduleEntry.stop); + coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop); + + nextFilePointer.addFileSpans(scheduleEntry.fileSpans); + } + else { + // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. + //System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex()); + for(SAMReaderID reader: indices.keySet()) + nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); + } + + // Early exit if no bins were found. + if(coveredRegion == null) { + // for debugging only: maximum split is 16384. + if(currentLocus.size() > 16384) { + GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384); + nextFilePointer.addLocation(splitContigs[0]); + currentLocus = splitContigs[1]; + } + else { + nextFilePointer.addLocation(currentLocus); + currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; + } + continue; + } + + // Early exit if only part of the first interval was found. + if(currentLocus.startsBefore(coveredRegion)) { + // for debugging only: maximum split is 16384. + int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); + GenomeLoc[] splitContigs = currentLocus.split(splitPoint); + nextFilePointer.addLocation(splitContigs[0]); + currentLocus = splitContigs[1]; + continue; + } + + // Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list. + GenomeLoc initialLocation = currentLocus.intersect(coveredRegion); + nextFilePointer.addLocation(initialLocation); + + // See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval. + if(!nextFilePointer.locations.isEmpty()) { + while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) { + currentLocus = locusIterator.next(); + nextFilePointer.addLocation(currentLocus.intersect(coveredRegion)); + } + + // Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus, + // we can simplify the interval creation process to the end of the covered region to the stop of the given interval. + if(coveredRegionStop < currentLocus.getStop()) + currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop()); + else if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + else + currentLocus = null; + } + + } + } + + + /** + * The last reference sequence processed by this iterator. + */ + private Integer lastReferenceSequenceLoaded = null; + + /** + * The stateful iterator used to progress through the genoem. + */ + private PeekableIterator bamScheduleIterator = null; + + /** + * Get the next overlapping tree of bins associated with the given BAM file. + * @param indices BAM index representation. + * @param currentLocus The actual locus for which to check overlap. + * @return The next schedule entry overlapping with the given list of loci. + */ + private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) { + // Stale reference sequence or first invocation. (Re)create the binTreeIterator. + if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) { + if(bamScheduleIterator != null) + bamScheduleIterator.close(); + lastReferenceSequenceLoaded = currentLocus.getContigIndex(); + + // Naive algorithm: find all elements in current contig for proper schedule creation. + List lociInContig = new LinkedList(); + for(GenomeLoc locus: loci) { + if(locus.getContigIndex() == lastReferenceSequenceLoaded) + lociInContig.add(locus); + } + + bamScheduleIterator = new PeekableIterator(new BAMSchedule(indices,lociInContig)); + } + + if(!bamScheduleIterator.hasNext()) + return null; + + // Peek the iterator along until finding the first binTree at or following the current locus. + BAMScheduleEntry bamScheduleEntry = bamScheduleIterator.peek(); + while(bamScheduleEntry != null && bamScheduleEntry.isBefore(currentLocus)) { + bamScheduleIterator.next(); + bamScheduleEntry = bamScheduleIterator.hasNext() ? bamScheduleIterator.peek() : null; + } + + return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null; + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index b2bfa224c..b1f264e63 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -24,22 +24,29 @@ package org.broadinstitute.sting.gatk.datasources.reads; +import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.SAMFileSpan; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; /** * Represents a small section of a BAM file, and every associated interval. */ class FilePointer { - protected final Map fileSpans = new HashMap(); - protected final String referenceSequence; + protected final SortedMap fileSpans = new TreeMap(); protected final BAMOverlap overlap; protected final List locations; @@ -48,24 +55,22 @@ class FilePointer { */ protected final boolean isRegionUnmapped; + public FilePointer() { + this((BAMOverlap)null); + } + public FilePointer(final GenomeLoc location) { - this.referenceSequence = location.getContig(); this.overlap = null; this.locations = Collections.singletonList(location); this.isRegionUnmapped = GenomeLoc.isUnmapped(location); } - public FilePointer(final String referenceSequence,final BAMOverlap overlap) { - this.referenceSequence = referenceSequence; + public FilePointer(final BAMOverlap overlap) { this.overlap = overlap; this.locations = new ArrayList(); this.isRegionUnmapped = false; } - public FilePointer(final String referenceSequence) { - this(referenceSequence,null); - } - public void addLocation(final GenomeLoc location) { locations.add(location); } @@ -77,4 +82,89 @@ class FilePointer { public void addFileSpans(final Map fileSpans) { this.fileSpans.putAll(fileSpans); } + + + /** + * Computes the size of this file span, in uncompressed bytes. + * @return Size of the file span. + */ + public long size() { + long size = 0L; + for(SAMFileSpan fileSpan: fileSpans.values()) + size += ((GATKBAMFileSpan)fileSpan).size(); + return size; + } + + /** + * Returns the difference in size between two filespans. + * @param other Other filespan against which to measure. + * @return The difference in size between the two file pointers. + */ + public long minus(final FilePointer other) { + long difference = 0; + PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); + PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); + + while(thisIterator.hasNext() || otherIterator.hasNext()) { + int compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); + + if(compareValue < 0) { + // This before other. + difference += ((GATKBAMFileSpan)thisIterator.next().getValue()).size(); + } + else if(compareValue > 0) { + // Other before this. + difference += ((GATKBAMFileSpan)otherIterator.next().getValue()).size(); + } + else { + // equality; difference the values. + GATKBAMFileSpan thisRegion = (GATKBAMFileSpan)thisIterator.next().getValue(); + GATKBAMFileSpan otherRegion = (GATKBAMFileSpan)otherIterator.next().getValue(); + difference += Math.abs(thisRegion.minus(otherRegion).size()); + } + } + return difference; + } + + /** + * Combines two file pointers into one. + * @param parser The genomelocparser to use when manipulating intervals. + * @param other File pointer to combine into this one. + * @return A completely new file pointer that is the combination of the two. + */ + public FilePointer combine(final GenomeLocParser parser, final FilePointer other) { + FilePointer combined = new FilePointer(); + + List intervals = new ArrayList(); + intervals.addAll(locations); + intervals.addAll(other.locations); + for(GenomeLoc interval: IntervalUtils.sortAndMergeIntervals(parser,intervals,IntervalMergingRule.ALL)) + combined.addLocation(interval); + + PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); + PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); + + while(thisIterator.hasNext() || otherIterator.hasNext()) { + int compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); + + if(compareValue < 0) { + // This before other. + Map.Entry entry = thisIterator.next(); + combined.addFileSpans(entry.getKey(),entry.getValue()); + } + else if(compareValue > 0) { + // Other before this. + Map.Entry entry = otherIterator.next(); + combined.addFileSpans(entry.getKey(),entry.getValue()); + } + else { + // equality; union the values. + SAMReaderID reader = thisIterator.peek().getKey(); + GATKBAMFileSpan thisRegion = (GATKBAMFileSpan)thisIterator.next().getValue(); + GATKBAMFileSpan otherRegion = (GATKBAMFileSpan)otherIterator.next().getValue(); + combined.addFileSpans(reader,thisRegion.union(otherRegion)); + } + } + return combined; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index f2110fae0..26166c753 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -185,7 +185,7 @@ public class IntervalSharder { filePointers.add(lastFilePointer); lastBAMOverlap = bamOverlapIterator.next(); - lastFilePointer = new FilePointer(contig,lastBAMOverlap); + lastFilePointer = new FilePointer(lastBAMOverlap); binStart = lastFilePointer.overlap.start; binStop = lastFilePointer.overlap.stop; } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java index 53fb49134..ba6321121 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -25,198 +25,44 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.GATKBAMFileSpan; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; /** - * Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once. + * Handles the process of aggregating BAM intervals into individual shards. */ public class LowMemoryIntervalSharder implements Iterator { - private static Logger logger = Logger.getLogger(IntervalSharder.class); + /** + * The iterator actually laying out the data for BAM scheduling. + */ + private final PeekableIterator wrappedIterator; - private final SAMDataSource dataSource; - - private final Map indices = new HashMap(); - - private FilePointer nextFilePointer = null; - - private final GenomeLocSortedSet loci; - - private final PeekableIterator locusIterator; - - private GenomeLoc currentLocus; + /** + * The parser, for interval manipulation. + */ + private final GenomeLocParser parser; public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - this.dataSource = dataSource; - for(SAMReaderID reader: dataSource.getReaderIDs()) - indices.put(reader,(GATKBAMIndex)dataSource.getIndex(reader)); - this.loci = loci; - locusIterator = new PeekableIterator(loci.iterator()); - if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - advance(); + wrappedIterator = new PeekableIterator(new BAMScheduler(dataSource,loci)); + parser = loci.getGenomeLocParser(); } public boolean hasNext() { - return nextFilePointer != null; + return wrappedIterator.hasNext(); } + /** + * Accumulate shards where there's no additional cost to processing the next shard in the sequence. + * @return The next file pointer to process. + */ public FilePointer next() { - if(!hasNext()) - throw new NoSuchElementException("No next element available in interval sharder"); - FilePointer currentFilePointer = nextFilePointer; - advance(); - return currentFilePointer; - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder"); - } - - private void advance() { - if(loci.isEmpty()) - return; - - nextFilePointer = null; - while(nextFilePointer == null && currentLocus != null) { - // special case handling of the unmapped shard. - if(currentLocus == GenomeLoc.UNMAPPED) { - nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED); - for(SAMReaderID id: dataSource.getReaderIDs()) - nextFilePointer.addFileSpans(id,null); - currentLocus = null; - continue; - } - - nextFilePointer = new FilePointer(currentLocus.getContig()); - - int coveredRegionStart = 1; - int coveredRegionStop = Integer.MAX_VALUE; - GenomeLoc coveredRegion = null; - - BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indices,currentLocus); - - // No overlapping data at all. - if(scheduleEntry != null) { - coveredRegionStart = Math.max(coveredRegionStart,scheduleEntry.start); - coveredRegionStop = Math.min(coveredRegionStop,scheduleEntry.stop); - coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop); - - nextFilePointer.addFileSpans(scheduleEntry.fileSpans); - } - else { - // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. - //System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex()); - for(SAMReaderID reader: indices.keySet()) - nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); - } - - // Early exit if no bins were found. - if(coveredRegion == null) { - // for debugging only: maximum split is 16384. - if(currentLocus.size() > 16384) { - GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384); - nextFilePointer.addLocation(splitContigs[0]); - currentLocus = splitContigs[1]; - } - else { - nextFilePointer.addLocation(currentLocus); - currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; - } - continue; - } - - // Early exit if only part of the first interval was found. - if(currentLocus.startsBefore(coveredRegion)) { - // for debugging only: maximum split is 16384. - int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); - GenomeLoc[] splitContigs = currentLocus.split(splitPoint); - nextFilePointer.addLocation(splitContigs[0]); - currentLocus = splitContigs[1]; - continue; - } - - // Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list. - GenomeLoc initialLocation = currentLocus.intersect(coveredRegion); - nextFilePointer.addLocation(initialLocation); - - // See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval. - if(!nextFilePointer.locations.isEmpty()) { - while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) { - currentLocus = locusIterator.next(); - nextFilePointer.addLocation(currentLocus.intersect(coveredRegion)); - } - - // Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus, - // we can simplify the interval creation process to the end of the covered region to the stop of the given interval. - if(coveredRegionStop < currentLocus.getStop()) - currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop()); - else if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - else - currentLocus = null; - } - - } - } - - - /** - * The last reference sequence processed by this iterator. - */ - private Integer lastReferenceSequenceLoaded = null; - - /** - * The stateful iterator used to progress through the genoem. - */ - private PeekableIterator bamScheduleIterator = null; - - /** - * Get the next overlapping tree of bins associated with the given BAM file. - * @param indices BAM index representation. - * @param currentLocus The actual locus for which to check overlap. - * @return The next schedule entry overlapping with the given list of loci. - */ - private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) { - // Stale reference sequence or first invocation. (Re)create the binTreeIterator. - if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) { - if(bamScheduleIterator != null) - bamScheduleIterator.close(); - lastReferenceSequenceLoaded = currentLocus.getContigIndex(); - - // Naive algorithm: find all elements in current contig for proper schedule creation. - List lociInContig = new LinkedList(); - for(GenomeLoc locus: loci) { - if(locus.getContigIndex() == lastReferenceSequenceLoaded) - lociInContig.add(locus); - } - - bamScheduleIterator = new PeekableIterator(new BAMSchedule(indices,lociInContig)); - } - - if(!bamScheduleIterator.hasNext()) - return null; - - // Peek the iterator along until finding the first binTree at or following the current locus. - BAMScheduleEntry bamScheduleEntry = bamScheduleIterator.peek(); - while(bamScheduleEntry != null && bamScheduleEntry.isBefore(currentLocus)) { - bamScheduleIterator.next(); - bamScheduleEntry = bamScheduleIterator.hasNext() ? bamScheduleIterator.peek() : null; - } - - return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null; + FilePointer current = wrappedIterator.next(); + while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) + current = current.combine(parser,wrappedIterator.next()); + return current; } + public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } } diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java index a08101f89..77903bbc0 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java +++ b/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java @@ -51,7 +51,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor public static final String SIMPLIFY_BAM_FULLNAME = "simplifyBAM"; public static final String SIMPLIFY_BAM_SHORTNAME = SIMPLIFY_BAM_FULLNAME; - public static final String CREATE_INDEX_FULLNAME = "disable_bam_indexing"; + public static final String DISABLE_INDEXING_FULLNAME = "disable_bam_indexing"; /** * The engine into which output stubs should be fed. @@ -82,7 +82,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor public List createArgumentDefinitions( ArgumentSource source ) { return Arrays.asList( createBAMArgumentDefinition(source), createBAMCompressionArgumentDefinition(source), - createWriteIndexArgumentDefinition(source), + disableWriteIndexArgumentDefinition(source), createSimplifyBAMArgumentDefinition(source)); } @@ -117,7 +117,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor if( compressionLevel != null ) stub.setCompressionLevel(compressionLevel); - stub.setIndexOnTheFly(!argumentIsPresent(createWriteIndexArgumentDefinition(source),matches)); + stub.setIndexOnTheFly(!argumentIsPresent(disableWriteIndexArgumentDefinition(source),matches)); stub.setSimplifyBAM(argumentIsPresent(createSimplifyBAMArgumentDefinition(source),matches)); // WARNING: Side effects required by engine! @@ -171,10 +171,10 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor null ); } - private ArgumentDefinition createWriteIndexArgumentDefinition(ArgumentSource source) { + private ArgumentDefinition disableWriteIndexArgumentDefinition(ArgumentSource source) { return new ArgumentDefinition( ArgumentIOType.ARGUMENT, boolean.class, - CREATE_INDEX_FULLNAME, + DISABLE_INDEXING_FULLNAME, null, "Turn off on-the-fly creation of indices for output BAM files.", false,