Prototype implementation of protoshard merging when protoshard n and protoshard
n+1 completely overlap. Gives a small but consistent performance increase in non-intervaled whole exome traversals (2.79min original, 2.69min revised). Needs a more in depth analysis of optimal shard sizing to determine a true optimum. Also renamed a variable because Khalid disapproved of my naming choices. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5595 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
32d502c122
commit
fece2167b3
|
|
@ -190,7 +190,7 @@ public class GATKBAMFileSpan extends BAMFileSpan {
|
|||
* @return This file span minuse the other file span.
|
||||
*/
|
||||
|
||||
public GATKBAMFileSpan subtract(final GATKBAMFileSpan other) {
|
||||
public GATKBAMFileSpan minus(final GATKBAMFileSpan other) {
|
||||
Iterator<GATKChunk> thisIterator = getGATKChunks().iterator();
|
||||
Iterator<GATKChunk> otherIterator = other.getGATKChunks().iterator();
|
||||
|
||||
|
|
@ -204,7 +204,7 @@ public class GATKBAMFileSpan extends BAMFileSpan {
|
|||
|
||||
while(thisChunk != null && otherChunk != null) {
|
||||
// If this iterator is before the other, add this to the subtracted list and forge ahead.
|
||||
if(thisChunk.getChunkEnd() < otherChunk.getChunkStart()) {
|
||||
if(thisChunk.getChunkEnd() <= otherChunk.getChunkStart()) {
|
||||
subtracted.add(thisChunk);
|
||||
thisChunk = thisIterator.hasNext() ? thisIterator.next() : null;
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,220 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once.
|
||||
*/
|
||||
public class BAMScheduler implements Iterator<FilePointer> {
|
||||
private final SAMDataSource dataSource;
|
||||
|
||||
private final Map<SAMReaderID,GATKBAMIndex> indices = new HashMap<SAMReaderID,GATKBAMIndex>();
|
||||
|
||||
private FilePointer nextFilePointer = null;
|
||||
|
||||
private final GenomeLocSortedSet loci;
|
||||
|
||||
private final PeekableIterator<GenomeLoc> locusIterator;
|
||||
|
||||
private GenomeLoc currentLocus;
|
||||
|
||||
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
this.dataSource = dataSource;
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
indices.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
|
||||
this.loci = loci;
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextFilePointer != null;
|
||||
}
|
||||
|
||||
public FilePointer next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next element available in interval sharder");
|
||||
FilePointer currentFilePointer = nextFilePointer;
|
||||
advance();
|
||||
return currentFilePointer;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
if(loci.isEmpty())
|
||||
return;
|
||||
|
||||
nextFilePointer = null;
|
||||
while(nextFilePointer == null && currentLocus != null) {
|
||||
// special case handling of the unmapped shard.
|
||||
if(currentLocus == GenomeLoc.UNMAPPED) {
|
||||
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||
nextFilePointer.addFileSpans(id,null);
|
||||
currentLocus = null;
|
||||
continue;
|
||||
}
|
||||
|
||||
nextFilePointer = new FilePointer();
|
||||
|
||||
int coveredRegionStart = 1;
|
||||
int coveredRegionStop = Integer.MAX_VALUE;
|
||||
GenomeLoc coveredRegion = null;
|
||||
|
||||
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indices,currentLocus);
|
||||
|
||||
// No overlapping data at all.
|
||||
if(scheduleEntry != null) {
|
||||
coveredRegionStart = Math.max(coveredRegionStart,scheduleEntry.start);
|
||||
coveredRegionStop = Math.min(coveredRegionStop,scheduleEntry.stop);
|
||||
coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop);
|
||||
|
||||
nextFilePointer.addFileSpans(scheduleEntry.fileSpans);
|
||||
}
|
||||
else {
|
||||
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
|
||||
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
|
||||
for(SAMReaderID reader: indices.keySet())
|
||||
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
|
||||
}
|
||||
|
||||
// Early exit if no bins were found.
|
||||
if(coveredRegion == null) {
|
||||
// for debugging only: maximum split is 16384.
|
||||
if(currentLocus.size() > 16384) {
|
||||
GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
currentLocus = splitContigs[1];
|
||||
}
|
||||
else {
|
||||
nextFilePointer.addLocation(currentLocus);
|
||||
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Early exit if only part of the first interval was found.
|
||||
if(currentLocus.startsBefore(coveredRegion)) {
|
||||
// for debugging only: maximum split is 16384.
|
||||
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
|
||||
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
currentLocus = splitContigs[1];
|
||||
continue;
|
||||
}
|
||||
|
||||
// Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list.
|
||||
GenomeLoc initialLocation = currentLocus.intersect(coveredRegion);
|
||||
nextFilePointer.addLocation(initialLocation);
|
||||
|
||||
// See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval.
|
||||
if(!nextFilePointer.locations.isEmpty()) {
|
||||
while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) {
|
||||
currentLocus = locusIterator.next();
|
||||
nextFilePointer.addLocation(currentLocus.intersect(coveredRegion));
|
||||
}
|
||||
|
||||
// Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus,
|
||||
// we can simplify the interval creation process to the end of the covered region to the stop of the given interval.
|
||||
if(coveredRegionStop < currentLocus.getStop())
|
||||
currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop());
|
||||
else if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
else
|
||||
currentLocus = null;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The last reference sequence processed by this iterator.
|
||||
*/
|
||||
private Integer lastReferenceSequenceLoaded = null;
|
||||
|
||||
/**
|
||||
* The stateful iterator used to progress through the genoem.
|
||||
*/
|
||||
private PeekableIterator<BAMScheduleEntry> bamScheduleIterator = null;
|
||||
|
||||
/**
|
||||
* Get the next overlapping tree of bins associated with the given BAM file.
|
||||
* @param indices BAM index representation.
|
||||
* @param currentLocus The actual locus for which to check overlap.
|
||||
* @return The next schedule entry overlapping with the given list of loci.
|
||||
*/
|
||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
|
||||
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
|
||||
if(bamScheduleIterator != null)
|
||||
bamScheduleIterator.close();
|
||||
lastReferenceSequenceLoaded = currentLocus.getContigIndex();
|
||||
|
||||
// Naive algorithm: find all elements in current contig for proper schedule creation.
|
||||
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
|
||||
for(GenomeLoc locus: loci) {
|
||||
if(locus.getContigIndex() == lastReferenceSequenceLoaded)
|
||||
lociInContig.add(locus);
|
||||
}
|
||||
|
||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
|
||||
}
|
||||
|
||||
if(!bamScheduleIterator.hasNext())
|
||||
return null;
|
||||
|
||||
// Peek the iterator along until finding the first binTree at or following the current locus.
|
||||
BAMScheduleEntry bamScheduleEntry = bamScheduleIterator.peek();
|
||||
while(bamScheduleEntry != null && bamScheduleEntry.isBefore(currentLocus)) {
|
||||
bamScheduleIterator.next();
|
||||
bamScheduleEntry = bamScheduleIterator.hasNext() ? bamScheduleIterator.peek() : null;
|
||||
}
|
||||
|
||||
return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -24,22 +24,29 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* Represents a small section of a BAM file, and every associated interval.
|
||||
*/
|
||||
class FilePointer {
|
||||
protected final Map<SAMReaderID,SAMFileSpan> fileSpans = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
protected final String referenceSequence;
|
||||
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
|
||||
protected final BAMOverlap overlap;
|
||||
protected final List<GenomeLoc> locations;
|
||||
|
||||
|
|
@ -48,24 +55,22 @@ class FilePointer {
|
|||
*/
|
||||
protected final boolean isRegionUnmapped;
|
||||
|
||||
public FilePointer() {
|
||||
this((BAMOverlap)null);
|
||||
}
|
||||
|
||||
public FilePointer(final GenomeLoc location) {
|
||||
this.referenceSequence = location.getContig();
|
||||
this.overlap = null;
|
||||
this.locations = Collections.singletonList(location);
|
||||
this.isRegionUnmapped = GenomeLoc.isUnmapped(location);
|
||||
}
|
||||
|
||||
public FilePointer(final String referenceSequence,final BAMOverlap overlap) {
|
||||
this.referenceSequence = referenceSequence;
|
||||
public FilePointer(final BAMOverlap overlap) {
|
||||
this.overlap = overlap;
|
||||
this.locations = new ArrayList<GenomeLoc>();
|
||||
this.isRegionUnmapped = false;
|
||||
}
|
||||
|
||||
public FilePointer(final String referenceSequence) {
|
||||
this(referenceSequence,null);
|
||||
}
|
||||
|
||||
public void addLocation(final GenomeLoc location) {
|
||||
locations.add(location);
|
||||
}
|
||||
|
|
@ -77,4 +82,89 @@ class FilePointer {
|
|||
public void addFileSpans(final Map<SAMReaderID, GATKBAMFileSpan> fileSpans) {
|
||||
this.fileSpans.putAll(fileSpans);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Computes the size of this file span, in uncompressed bytes.
|
||||
* @return Size of the file span.
|
||||
*/
|
||||
public long size() {
|
||||
long size = 0L;
|
||||
for(SAMFileSpan fileSpan: fileSpans.values())
|
||||
size += ((GATKBAMFileSpan)fileSpan).size();
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the difference in size between two filespans.
|
||||
* @param other Other filespan against which to measure.
|
||||
* @return The difference in size between the two file pointers.
|
||||
*/
|
||||
public long minus(final FilePointer other) {
|
||||
long difference = 0;
|
||||
PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> thisIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(this.fileSpans.entrySet().iterator());
|
||||
PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> otherIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(other.fileSpans.entrySet().iterator());
|
||||
|
||||
while(thisIterator.hasNext() || otherIterator.hasNext()) {
|
||||
int compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey());
|
||||
|
||||
if(compareValue < 0) {
|
||||
// This before other.
|
||||
difference += ((GATKBAMFileSpan)thisIterator.next().getValue()).size();
|
||||
}
|
||||
else if(compareValue > 0) {
|
||||
// Other before this.
|
||||
difference += ((GATKBAMFileSpan)otherIterator.next().getValue()).size();
|
||||
}
|
||||
else {
|
||||
// equality; difference the values.
|
||||
GATKBAMFileSpan thisRegion = (GATKBAMFileSpan)thisIterator.next().getValue();
|
||||
GATKBAMFileSpan otherRegion = (GATKBAMFileSpan)otherIterator.next().getValue();
|
||||
difference += Math.abs(thisRegion.minus(otherRegion).size());
|
||||
}
|
||||
}
|
||||
return difference;
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines two file pointers into one.
|
||||
* @param parser The genomelocparser to use when manipulating intervals.
|
||||
* @param other File pointer to combine into this one.
|
||||
* @return A completely new file pointer that is the combination of the two.
|
||||
*/
|
||||
public FilePointer combine(final GenomeLocParser parser, final FilePointer other) {
|
||||
FilePointer combined = new FilePointer();
|
||||
|
||||
List<GenomeLoc> intervals = new ArrayList<GenomeLoc>();
|
||||
intervals.addAll(locations);
|
||||
intervals.addAll(other.locations);
|
||||
for(GenomeLoc interval: IntervalUtils.sortAndMergeIntervals(parser,intervals,IntervalMergingRule.ALL))
|
||||
combined.addLocation(interval);
|
||||
|
||||
PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> thisIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(this.fileSpans.entrySet().iterator());
|
||||
PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> otherIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(other.fileSpans.entrySet().iterator());
|
||||
|
||||
while(thisIterator.hasNext() || otherIterator.hasNext()) {
|
||||
int compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey());
|
||||
|
||||
if(compareValue < 0) {
|
||||
// This before other.
|
||||
Map.Entry<SAMReaderID,SAMFileSpan> entry = thisIterator.next();
|
||||
combined.addFileSpans(entry.getKey(),entry.getValue());
|
||||
}
|
||||
else if(compareValue > 0) {
|
||||
// Other before this.
|
||||
Map.Entry<SAMReaderID,SAMFileSpan> entry = otherIterator.next();
|
||||
combined.addFileSpans(entry.getKey(),entry.getValue());
|
||||
}
|
||||
else {
|
||||
// equality; union the values.
|
||||
SAMReaderID reader = thisIterator.peek().getKey();
|
||||
GATKBAMFileSpan thisRegion = (GATKBAMFileSpan)thisIterator.next().getValue();
|
||||
GATKBAMFileSpan otherRegion = (GATKBAMFileSpan)otherIterator.next().getValue();
|
||||
combined.addFileSpans(reader,thisRegion.union(otherRegion));
|
||||
}
|
||||
}
|
||||
return combined;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -185,7 +185,7 @@ public class IntervalSharder {
|
|||
filePointers.add(lastFilePointer);
|
||||
|
||||
lastBAMOverlap = bamOverlapIterator.next();
|
||||
lastFilePointer = new FilePointer(contig,lastBAMOverlap);
|
||||
lastFilePointer = new FilePointer(lastBAMOverlap);
|
||||
binStart = lastFilePointer.overlap.start;
|
||||
binStop = lastFilePointer.overlap.stop;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,198 +25,44 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once.
|
||||
* Handles the process of aggregating BAM intervals into individual shards.
|
||||
*/
|
||||
public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
||||
private static Logger logger = Logger.getLogger(IntervalSharder.class);
|
||||
/**
|
||||
* The iterator actually laying out the data for BAM scheduling.
|
||||
*/
|
||||
private final PeekableIterator<FilePointer> wrappedIterator;
|
||||
|
||||
private final SAMDataSource dataSource;
|
||||
|
||||
private final Map<SAMReaderID,GATKBAMIndex> indices = new HashMap<SAMReaderID,GATKBAMIndex>();
|
||||
|
||||
private FilePointer nextFilePointer = null;
|
||||
|
||||
private final GenomeLocSortedSet loci;
|
||||
|
||||
private final PeekableIterator<GenomeLoc> locusIterator;
|
||||
|
||||
private GenomeLoc currentLocus;
|
||||
/**
|
||||
* The parser, for interval manipulation.
|
||||
*/
|
||||
private final GenomeLocParser parser;
|
||||
|
||||
public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
this.dataSource = dataSource;
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
indices.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
|
||||
this.loci = loci;
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
advance();
|
||||
wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
|
||||
parser = loci.getGenomeLocParser();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextFilePointer != null;
|
||||
return wrappedIterator.hasNext();
|
||||
}
|
||||
|
||||
/**
|
||||
* Accumulate shards where there's no additional cost to processing the next shard in the sequence.
|
||||
* @return The next file pointer to process.
|
||||
*/
|
||||
public FilePointer next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next element available in interval sharder");
|
||||
FilePointer currentFilePointer = nextFilePointer;
|
||||
advance();
|
||||
return currentFilePointer;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
if(loci.isEmpty())
|
||||
return;
|
||||
|
||||
nextFilePointer = null;
|
||||
while(nextFilePointer == null && currentLocus != null) {
|
||||
// special case handling of the unmapped shard.
|
||||
if(currentLocus == GenomeLoc.UNMAPPED) {
|
||||
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||
nextFilePointer.addFileSpans(id,null);
|
||||
currentLocus = null;
|
||||
continue;
|
||||
}
|
||||
|
||||
nextFilePointer = new FilePointer(currentLocus.getContig());
|
||||
|
||||
int coveredRegionStart = 1;
|
||||
int coveredRegionStop = Integer.MAX_VALUE;
|
||||
GenomeLoc coveredRegion = null;
|
||||
|
||||
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indices,currentLocus);
|
||||
|
||||
// No overlapping data at all.
|
||||
if(scheduleEntry != null) {
|
||||
coveredRegionStart = Math.max(coveredRegionStart,scheduleEntry.start);
|
||||
coveredRegionStop = Math.min(coveredRegionStop,scheduleEntry.stop);
|
||||
coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop);
|
||||
|
||||
nextFilePointer.addFileSpans(scheduleEntry.fileSpans);
|
||||
}
|
||||
else {
|
||||
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
|
||||
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
|
||||
for(SAMReaderID reader: indices.keySet())
|
||||
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
|
||||
}
|
||||
|
||||
// Early exit if no bins were found.
|
||||
if(coveredRegion == null) {
|
||||
// for debugging only: maximum split is 16384.
|
||||
if(currentLocus.size() > 16384) {
|
||||
GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
currentLocus = splitContigs[1];
|
||||
}
|
||||
else {
|
||||
nextFilePointer.addLocation(currentLocus);
|
||||
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Early exit if only part of the first interval was found.
|
||||
if(currentLocus.startsBefore(coveredRegion)) {
|
||||
// for debugging only: maximum split is 16384.
|
||||
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
|
||||
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
currentLocus = splitContigs[1];
|
||||
continue;
|
||||
}
|
||||
|
||||
// Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list.
|
||||
GenomeLoc initialLocation = currentLocus.intersect(coveredRegion);
|
||||
nextFilePointer.addLocation(initialLocation);
|
||||
|
||||
// See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval.
|
||||
if(!nextFilePointer.locations.isEmpty()) {
|
||||
while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) {
|
||||
currentLocus = locusIterator.next();
|
||||
nextFilePointer.addLocation(currentLocus.intersect(coveredRegion));
|
||||
}
|
||||
|
||||
// Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus,
|
||||
// we can simplify the interval creation process to the end of the covered region to the stop of the given interval.
|
||||
if(coveredRegionStop < currentLocus.getStop())
|
||||
currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop());
|
||||
else if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
else
|
||||
currentLocus = null;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The last reference sequence processed by this iterator.
|
||||
*/
|
||||
private Integer lastReferenceSequenceLoaded = null;
|
||||
|
||||
/**
|
||||
* The stateful iterator used to progress through the genoem.
|
||||
*/
|
||||
private PeekableIterator<BAMScheduleEntry> bamScheduleIterator = null;
|
||||
|
||||
/**
|
||||
* Get the next overlapping tree of bins associated with the given BAM file.
|
||||
* @param indices BAM index representation.
|
||||
* @param currentLocus The actual locus for which to check overlap.
|
||||
* @return The next schedule entry overlapping with the given list of loci.
|
||||
*/
|
||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
|
||||
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
|
||||
if(bamScheduleIterator != null)
|
||||
bamScheduleIterator.close();
|
||||
lastReferenceSequenceLoaded = currentLocus.getContigIndex();
|
||||
|
||||
// Naive algorithm: find all elements in current contig for proper schedule creation.
|
||||
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
|
||||
for(GenomeLoc locus: loci) {
|
||||
if(locus.getContigIndex() == lastReferenceSequenceLoaded)
|
||||
lociInContig.add(locus);
|
||||
}
|
||||
|
||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
|
||||
}
|
||||
|
||||
if(!bamScheduleIterator.hasNext())
|
||||
return null;
|
||||
|
||||
// Peek the iterator along until finding the first binTree at or following the current locus.
|
||||
BAMScheduleEntry bamScheduleEntry = bamScheduleIterator.peek();
|
||||
while(bamScheduleEntry != null && bamScheduleEntry.isBefore(currentLocus)) {
|
||||
bamScheduleIterator.next();
|
||||
bamScheduleEntry = bamScheduleIterator.hasNext() ? bamScheduleIterator.peek() : null;
|
||||
}
|
||||
|
||||
return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null;
|
||||
FilePointer current = wrappedIterator.next();
|
||||
while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
|
||||
current = current.combine(parser,wrappedIterator.next());
|
||||
return current;
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
|
|||
public static final String SIMPLIFY_BAM_FULLNAME = "simplifyBAM";
|
||||
public static final String SIMPLIFY_BAM_SHORTNAME = SIMPLIFY_BAM_FULLNAME;
|
||||
|
||||
public static final String CREATE_INDEX_FULLNAME = "disable_bam_indexing";
|
||||
public static final String DISABLE_INDEXING_FULLNAME = "disable_bam_indexing";
|
||||
|
||||
/**
|
||||
* The engine into which output stubs should be fed.
|
||||
|
|
@ -82,7 +82,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
|
|||
public List<ArgumentDefinition> createArgumentDefinitions( ArgumentSource source ) {
|
||||
return Arrays.asList( createBAMArgumentDefinition(source),
|
||||
createBAMCompressionArgumentDefinition(source),
|
||||
createWriteIndexArgumentDefinition(source),
|
||||
disableWriteIndexArgumentDefinition(source),
|
||||
createSimplifyBAMArgumentDefinition(source));
|
||||
}
|
||||
|
||||
|
|
@ -117,7 +117,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
|
|||
if( compressionLevel != null )
|
||||
stub.setCompressionLevel(compressionLevel);
|
||||
|
||||
stub.setIndexOnTheFly(!argumentIsPresent(createWriteIndexArgumentDefinition(source),matches));
|
||||
stub.setIndexOnTheFly(!argumentIsPresent(disableWriteIndexArgumentDefinition(source),matches));
|
||||
stub.setSimplifyBAM(argumentIsPresent(createSimplifyBAMArgumentDefinition(source),matches));
|
||||
|
||||
// WARNING: Side effects required by engine!
|
||||
|
|
@ -171,10 +171,10 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
|
|||
null );
|
||||
}
|
||||
|
||||
private ArgumentDefinition createWriteIndexArgumentDefinition(ArgumentSource source) {
|
||||
private ArgumentDefinition disableWriteIndexArgumentDefinition(ArgumentSource source) {
|
||||
return new ArgumentDefinition( ArgumentIOType.ARGUMENT,
|
||||
boolean.class,
|
||||
CREATE_INDEX_FULLNAME,
|
||||
DISABLE_INDEXING_FULLNAME,
|
||||
null,
|
||||
"Turn off on-the-fly creation of indices for output BAM files.",
|
||||
false,
|
||||
|
|
|
|||
Loading…
Reference in New Issue