diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java index 6029ea065..be46d7094 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BAMFormatAwareShard.java @@ -53,4 +53,12 @@ public interface BAMFormatAwareShard extends Shard { * @return An iterator over the reads stored in the shard. */ public StingSAMIterator iterator(); + + /** + * Whether this shard points to an unmapped region. + * Some shard types conceptually be unmapped (e.g. LocusShards). In + * this case, isUnmapped should always return false. + * @return True if this shard is unmapped. False otherwise. + */ + public boolean isUnmapped(); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java index 5cb41182c..1fdd6b100 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java @@ -87,21 +87,38 @@ public class IntervalSharder { GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser()); String contig = null; + // If the next section of the BAM to be processed is unmapped, handle this region separately. while(locusIterator.hasNext() && nextBatch.isEmpty()) { contig = null; - while(locusIterator.hasNext() && (contig == null || locusIterator.peek().getContig().equals(contig))) { + while(locusIterator.hasNext() && (contig == null || (locusIterator.peek() != GenomeLoc.UNMAPPED && locusIterator.peek().getContig().equals(contig)))) { GenomeLoc nextLocus = locusIterator.next(); contig = nextLocus.getContig(); nextBatch.add(nextLocus); } } - if(nextBatch.size() > 0) + if(nextBatch.size() > 0) { cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch)); + } } } - + + /** + * Merge / split intervals based on an awareness of the structure of the BAM file. + * @param dataSource + * @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads. + * @param loci + * @return + */ private static List shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) { + // If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs. + if(contig == null) { + FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED); + for(SAMReaderID id: dataSource.getReaderIDs()) + filePointer.addFileSpans(id,null); + return Collections.singletonList(filePointer); + } + // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. List filePointers = new ArrayList(); FilePointer lastFilePointer = null; @@ -111,7 +128,8 @@ public class IntervalSharder { BinMergingIterator binMerger = new BinMergingIterator(); for(SAMReaderID id: dataSource.getReaderIDs()) { final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig); - if(referenceSequence == null) + // If this contig can't be found in the reference, skip over it. + if(referenceSequence == null && contig != null) continue; final BrowseableBAMIndex index = dataSource.getIndex(id); binMerger.addReader(id, @@ -360,16 +378,23 @@ class FilePointer { protected final BAMOverlap overlap; protected final List locations; + /** + * Does this file pointer point into an unmapped region? + */ + protected final boolean isRegionUnmapped; + public FilePointer(final GenomeLoc location) { - referenceSequence = location.getContig(); - overlap = null; - locations = Collections.singletonList(location); + this.referenceSequence = location.getContig(); + this.overlap = null; + this.locations = Collections.singletonList(location); + this.isRegionUnmapped = location == GenomeLoc.UNMAPPED; } public FilePointer(final String referenceSequence,final BAMOverlap overlap) { this.referenceSequence = referenceSequence; this.overlap = overlap; this.locations = new ArrayList(); + this.isRegionUnmapped = false; } public void addLocation(GenomeLoc location) { diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShard.java index ba8eebb68..55bd5be70 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShard.java @@ -117,6 +117,15 @@ public class LocusShard implements BAMFormatAwareShard { return ShardType.LOCUS; } + /** + * Locus shards don't make sense as unmapped regions. Always return false. + * @return False always. + */ + @Override + public boolean isUnmapped() { + return false; + } + /** * Gets key read validation and filtering properties. * @return set of read properties associated with this shard. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java index 9245faa96..a6dacfa48 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShard.java @@ -54,15 +54,21 @@ public class ReadShard implements BAMFormatAwareShard { */ private final List loci; + /** + * Whether the current location is unmapped. + */ + private final boolean isUnmapped; + /** * Statistics about which reads in this shards were used and which were filtered away. */ private final ReadMetrics readMetrics = new ReadMetrics(); - public ReadShard(SAMDataSource readsDataSource, Map fileSpans, List loci) { + public ReadShard(SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { this.readsDataSource = readsDataSource; this.fileSpans = fileSpans; this.loci = loci; + this.isUnmapped = isUnmapped; } /** @@ -88,6 +94,16 @@ public class ReadShard implements BAMFormatAwareShard { return loci; } + /** + * Whether this shard points to an unmapped region. + * @return True if this shard is unmapped. False otherwise. + */ + @Override + public boolean isUnmapped() { + return isUnmapped; + } + + /** * Returns true if this shard is meant to buffer reads, rather * than just holding pointers to their locations. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java index f17441a35..804439e23 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ReadShardStrategy.java @@ -79,6 +79,11 @@ public class ReadShardStrategy implements ShardStrategy { */ private Map position; + /** + * An indicator whether the strategy has sharded into the unmapped region. + */ + private boolean isIntoUnmappedRegion = false; + /** * Create a new read shard strategy, loading read shards from the given BAM file. * @param dataSource Data source from which to load shards. @@ -130,13 +135,27 @@ public class ReadShardStrategy implements ShardStrategy { while(selectedReaders.size() == 0 && currentFilePointer != null) { shardPosition = currentFilePointer.fileSpans; for(SAMReaderID id: shardPosition.keySet()) { - SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id)); - if(!fileSpan.isEmpty()) - selectedReaders.put(id,fileSpan); + // If the region contains location information (in other words, it is not at + // the start of the unmapped region), add the region. + if(currentFilePointer.isRegionUnmapped) { + // If the region is unmapped and no location data exists, add a null as an indicator to + // start at the next unmapped region. + if(!isIntoUnmappedRegion) { + selectedReaders.put(id,null); + isIntoUnmappedRegion = true; + } + else + selectedReaders.put(id,position.get(id)); + } + else { + SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id)); + if(!fileSpan.isEmpty()) + selectedReaders.put(id,fileSpan); + } } if(selectedReaders.size() > 0) { - BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations); + BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); dataSource.fillShard(shard); if(!shard.isBufferEmpty()) { @@ -150,7 +169,7 @@ public class ReadShardStrategy implements ShardStrategy { } } else { - BAMFormatAwareShard shard = new ReadShard(dataSource,position,null); + BAMFormatAwareShard shard = new ReadShard(dataSource,position,null,false); dataSource.fillShard(shard); nextShard = !shard.isBufferEmpty() ? shard : null; } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index bea535bf7..fe5b45b8f 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -501,9 +501,12 @@ public class SAMDataSource implements SimpleDataSource { MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true); for(SAMReaderID id: getReaderIDs()) { - if(shard.getFileSpans().get(id) == null) + CloseableIterator iterator = null; + if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null) continue; - CloseableIterator iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + iterator = shard.getFileSpans().get(id) != null ? + readers.getReader(id).iterator(shard.getFileSpans().get(id)) : + readers.getReader(id).queryUnmapped(); if(readProperties.getReadBufferSize() != null) iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize()); if(shard.getGenomeLocs() != null) @@ -823,6 +826,11 @@ public class SAMDataSource implements SimpleDataSource { */ private SAMRecord nextRead; + /** + * Rather than using the straight genomic bounds, use filter out only mapped reads. + */ + private boolean keepOnlyUnmappedReads; + /** * Custom representation of interval bounds. * Makes it simpler to track current position. @@ -837,14 +845,30 @@ public class SAMDataSource implements SimpleDataSource { public IntervalOverlapFilteringIterator(CloseableIterator iterator, List intervals) { this.iterator = iterator; - this.intervalStarts = new int[intervals.size()]; - this.intervalEnds = new int[intervals.size()]; - int i = 0; - for(GenomeLoc interval: intervals) { - intervalStarts[i] = (int)interval.getStart(); - intervalEnds[i] = (int)interval.getStop(); - i++; + + // Look at the interval list to detect whether we should worry about unmapped reads. + // If we find a mix of mapped/unmapped intervals, throw an exception. + boolean foundMappedIntervals = false; + for(GenomeLoc location: intervals) { + if(location != GenomeLoc.UNMAPPED) + foundMappedIntervals = true; + keepOnlyUnmappedReads |= (location == GenomeLoc.UNMAPPED); } + + + if(foundMappedIntervals) { + if(keepOnlyUnmappedReads) + throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads"); + this.intervalStarts = new int[intervals.size()]; + this.intervalEnds = new int[intervals.size()]; + int i = 0; + for(GenomeLoc interval: intervals) { + intervalStarts[i] = (int)interval.getStart(); + intervalEnds[i] = (int)interval.getStop(); + i++; + } + } + advance(); } @@ -876,21 +900,31 @@ public class SAMDataSource implements SimpleDataSource { return; SAMRecord candidateRead = iterator.next(); - while(nextRead == null && currentBound < intervalStarts.length) { - if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] || - (candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) { - // This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval. - // Promising, but this read must be checked against the ending bound. - if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) { - // Yes, this read is within both bounds. This must be our next read. - nextRead = candidateRead; - break; + while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) { + if(!keepOnlyUnmappedReads) { + // Mapped read filter; check against GenomeLoc-derived bounds. + if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] || + (candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) { + // This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval. + // Promising, but this read must be checked against the ending bound. + if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) { + // Yes, this read is within both bounds. This must be our next read. + nextRead = candidateRead; + break; + } + else { + // Oops, we're past the end bound. Increment the current bound and try again. + currentBound++; + continue; + } } - else { - // Oops, we're past the end bound. Increment the current bound and try again. - currentBound++; + } + else { + // Unmapped read filter; just check getReadUnmappedFlag(). + if(!candidateRead.getReadUnmappedFlag()) continue; - } + nextRead = candidateRead; + break; } // No more reads available. Stop the search. diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index d125b4710..7fb93a208 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -26,6 +26,14 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable protected final int start; protected final int stop; protected final String contigName; + + /** + * A static constant to use when referring to the unmapped section of a datafile + * file. The unmapped region cannot be subdivided. Only this instance of + * the object may be used to refer to the region, as '==' comparisons are used + * in comparators, etc. + */ + public static final GenomeLoc UNMAPPED = new GenomeLoc(null,-1,0,0); // -------------------------------------------------------------------------------------------------------------- // @@ -64,6 +72,7 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable public final int getStart() { return this.start; } public final int getStop() { return this.stop; } public final String toString() { + if(this == UNMAPPED) return "unmapped"; if ( throughEndOfContigP() && atBeginningOfContigP() ) return getContig(); else if ( throughEndOfContigP() || getStart() == getStop() ) @@ -91,6 +100,12 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable } public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException { + if(this == UNMAPPED || that == UNMAPPED) { + if(this != UNMAPPED || that != UNMAPPED) + throw new ReviewedStingException("Tried to merge a mapped and an unmapped genome loc"); + return UNMAPPED; + } + if (!(this.contiguousP(that))) { throw new ReviewedStingException("The two genome loc's need to be contigous"); } @@ -101,6 +116,12 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable } public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException { + if(this == UNMAPPED || that == UNMAPPED) { + if(this != UNMAPPED || that != UNMAPPED) + throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc"); + return UNMAPPED; + } + if (!(this.overlapsP(that))) { throw new ReviewedStingException("GenomeLoc::intersect(): The two genome loc's need to overlap"); } @@ -216,7 +237,12 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable int result = 0; if ( this == that ) { result = 0; - } else { + } + else if(this == UNMAPPED) + result = 1; + else if(that == UNMAPPED) + result = -1; + else { final int cmpContig = compareContigs(that); if ( cmpContig != 0 ) { diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 4197982c9..c80ab492a 100644 --- a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -41,20 +41,21 @@ public class IntervalUtils { // ensure that the arg list isn't null before looping. for (String argument : argList) { - // if any interval argument is '-L all', consider all loci by returning no intervals - if (argument.equals("all")) { - if (argList.size() != 1) { - // throw error if '-L all' is not only interval - potentially conflicting commands - throw new UserException.CommandLineException(String.format("Conflicting arguments: Intervals given along with \"-L all\"")); - } - return null; - } - // separate argument on semicolon first for (String fileOrInterval : argument.split(";")) { - + // if any interval argument is '-L all', consider all loci by returning no intervals + if (fileOrInterval.trim().toLowerCase().equals("all")) { + if (argList.size() != 1) { + // throw error if '-L all' is not only interval - potentially conflicting commands + throw new UserException.CommandLineException(String.format("Conflicting arguments: Intervals given along with \"-L all\"")); + } + return null; + } + // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. + else if(fileOrInterval.trim().toLowerCase().equals("unmapped")) + rawIntervals.add(GenomeLoc.UNMAPPED); // if it's a file, add items to raw interval list - if (isIntervalFile(fileOrInterval)) { + else if (isIntervalFile(fileOrInterval)) { try { rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList)); }