Support for '-L unmapped' in read walkers. DO NOT USE THIS PATCH YET. It has been

subjected to and passes cursory testing on one dataset (and all integration tests pass).
However, there's a small library of validation checks, and unit and integration tests 
that must be added.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4813 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-12-09 19:51:48 +00:00
parent a2d6cef181
commit d4d3170436
8 changed files with 185 additions and 47 deletions

View File

@ -53,4 +53,12 @@ public interface BAMFormatAwareShard extends Shard {
* @return An iterator over the reads stored in the shard.
*/
public StingSAMIterator iterator();
/**
* Whether this shard points to an unmapped region.
* Some shard types conceptually be unmapped (e.g. LocusShards). In
* this case, isUnmapped should always return false.
* @return True if this shard is unmapped. False otherwise.
*/
public boolean isUnmapped();
}

View File

@ -87,21 +87,38 @@ public class IntervalSharder {
GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
String contig = null;
// If the next section of the BAM to be processed is unmapped, handle this region separately.
while(locusIterator.hasNext() && nextBatch.isEmpty()) {
contig = null;
while(locusIterator.hasNext() && (contig == null || locusIterator.peek().getContig().equals(contig))) {
while(locusIterator.hasNext() && (contig == null || (locusIterator.peek() != GenomeLoc.UNMAPPED && locusIterator.peek().getContig().equals(contig)))) {
GenomeLoc nextLocus = locusIterator.next();
contig = nextLocus.getContig();
nextBatch.add(nextLocus);
}
}
if(nextBatch.size() > 0)
if(nextBatch.size() > 0) {
cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
}
}
}
/**
* Merge / split intervals based on an awareness of the structure of the BAM file.
* @param dataSource
* @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
* @param loci
* @return
*/
private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
// If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
if(contig == null) {
FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
filePointer.addFileSpans(id,null);
return Collections.singletonList(filePointer);
}
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
List<FilePointer> filePointers = new ArrayList<FilePointer>();
FilePointer lastFilePointer = null;
@ -111,7 +128,8 @@ public class IntervalSharder {
BinMergingIterator binMerger = new BinMergingIterator();
for(SAMReaderID id: dataSource.getReaderIDs()) {
final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
if(referenceSequence == null)
// If this contig can't be found in the reference, skip over it.
if(referenceSequence == null && contig != null)
continue;
final BrowseableBAMIndex index = dataSource.getIndex(id);
binMerger.addReader(id,
@ -360,16 +378,23 @@ class FilePointer {
protected final BAMOverlap overlap;
protected final List<GenomeLoc> locations;
/**
* Does this file pointer point into an unmapped region?
*/
protected final boolean isRegionUnmapped;
public FilePointer(final GenomeLoc location) {
referenceSequence = location.getContig();
overlap = null;
locations = Collections.singletonList(location);
this.referenceSequence = location.getContig();
this.overlap = null;
this.locations = Collections.singletonList(location);
this.isRegionUnmapped = location == GenomeLoc.UNMAPPED;
}
public FilePointer(final String referenceSequence,final BAMOverlap overlap) {
this.referenceSequence = referenceSequence;
this.overlap = overlap;
this.locations = new ArrayList<GenomeLoc>();
this.isRegionUnmapped = false;
}
public void addLocation(GenomeLoc location) {

View File

@ -117,6 +117,15 @@ public class LocusShard implements BAMFormatAwareShard {
return ShardType.LOCUS;
}
/**
* Locus shards don't make sense as unmapped regions. Always return false.
* @return False always.
*/
@Override
public boolean isUnmapped() {
return false;
}
/**
* Gets key read validation and filtering properties.
* @return set of read properties associated with this shard.

View File

@ -54,15 +54,21 @@ public class ReadShard implements BAMFormatAwareShard {
*/
private final List<GenomeLoc> loci;
/**
* Whether the current location is unmapped.
*/
private final boolean isUnmapped;
/**
* Statistics about which reads in this shards were used and which were filtered away.
*/
private final ReadMetrics readMetrics = new ReadMetrics();
public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci) {
public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
this.readsDataSource = readsDataSource;
this.fileSpans = fileSpans;
this.loci = loci;
this.isUnmapped = isUnmapped;
}
/**
@ -88,6 +94,16 @@ public class ReadShard implements BAMFormatAwareShard {
return loci;
}
/**
* Whether this shard points to an unmapped region.
* @return True if this shard is unmapped. False otherwise.
*/
@Override
public boolean isUnmapped() {
return isUnmapped;
}
/**
* Returns true if this shard is meant to buffer reads, rather
* than just holding pointers to their locations.

View File

@ -79,6 +79,11 @@ public class ReadShardStrategy implements ShardStrategy {
*/
private Map<SAMReaderID,SAMFileSpan> position;
/**
* An indicator whether the strategy has sharded into the unmapped region.
*/
private boolean isIntoUnmappedRegion = false;
/**
* Create a new read shard strategy, loading read shards from the given BAM file.
* @param dataSource Data source from which to load shards.
@ -130,13 +135,27 @@ public class ReadShardStrategy implements ShardStrategy {
while(selectedReaders.size() == 0 && currentFilePointer != null) {
shardPosition = currentFilePointer.fileSpans;
for(SAMReaderID id: shardPosition.keySet()) {
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
if(!fileSpan.isEmpty())
selectedReaders.put(id,fileSpan);
// If the region contains location information (in other words, it is not at
// the start of the unmapped region), add the region.
if(currentFilePointer.isRegionUnmapped) {
// If the region is unmapped and no location data exists, add a null as an indicator to
// start at the next unmapped region.
if(!isIntoUnmappedRegion) {
selectedReaders.put(id,null);
isIntoUnmappedRegion = true;
}
else
selectedReaders.put(id,position.get(id));
}
else {
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
if(!fileSpan.isEmpty())
selectedReaders.put(id,fileSpan);
}
}
if(selectedReaders.size() > 0) {
BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations);
BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
dataSource.fillShard(shard);
if(!shard.isBufferEmpty()) {
@ -150,7 +169,7 @@ public class ReadShardStrategy implements ShardStrategy {
}
}
else {
BAMFormatAwareShard shard = new ReadShard(dataSource,position,null);
BAMFormatAwareShard shard = new ReadShard(dataSource,position,null,false);
dataSource.fillShard(shard);
nextShard = !shard.isBufferEmpty() ? shard : null;
}

View File

@ -501,9 +501,12 @@ public class SAMDataSource implements SimpleDataSource {
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
for(SAMReaderID id: getReaderIDs()) {
if(shard.getFileSpans().get(id) == null)
CloseableIterator<SAMRecord> iterator = null;
if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null)
continue;
CloseableIterator<SAMRecord> iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
iterator = shard.getFileSpans().get(id) != null ?
readers.getReader(id).iterator(shard.getFileSpans().get(id)) :
readers.getReader(id).queryUnmapped();
if(readProperties.getReadBufferSize() != null)
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
if(shard.getGenomeLocs() != null)
@ -823,6 +826,11 @@ public class SAMDataSource implements SimpleDataSource {
*/
private SAMRecord nextRead;
/**
* Rather than using the straight genomic bounds, use filter out only mapped reads.
*/
private boolean keepOnlyUnmappedReads;
/**
* Custom representation of interval bounds.
* Makes it simpler to track current position.
@ -837,14 +845,30 @@ public class SAMDataSource implements SimpleDataSource {
public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
this.iterator = iterator;
this.intervalStarts = new int[intervals.size()];
this.intervalEnds = new int[intervals.size()];
int i = 0;
for(GenomeLoc interval: intervals) {
intervalStarts[i] = (int)interval.getStart();
intervalEnds[i] = (int)interval.getStop();
i++;
// Look at the interval list to detect whether we should worry about unmapped reads.
// If we find a mix of mapped/unmapped intervals, throw an exception.
boolean foundMappedIntervals = false;
for(GenomeLoc location: intervals) {
if(location != GenomeLoc.UNMAPPED)
foundMappedIntervals = true;
keepOnlyUnmappedReads |= (location == GenomeLoc.UNMAPPED);
}
if(foundMappedIntervals) {
if(keepOnlyUnmappedReads)
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
this.intervalStarts = new int[intervals.size()];
this.intervalEnds = new int[intervals.size()];
int i = 0;
for(GenomeLoc interval: intervals) {
intervalStarts[i] = (int)interval.getStart();
intervalEnds[i] = (int)interval.getStop();
i++;
}
}
advance();
}
@ -876,21 +900,31 @@ public class SAMDataSource implements SimpleDataSource {
return;
SAMRecord candidateRead = iterator.next();
while(nextRead == null && currentBound < intervalStarts.length) {
if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] ||
(candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) {
// This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval.
// Promising, but this read must be checked against the ending bound.
if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) {
// Yes, this read is within both bounds. This must be our next read.
nextRead = candidateRead;
break;
while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
if(!keepOnlyUnmappedReads) {
// Mapped read filter; check against GenomeLoc-derived bounds.
if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] ||
(candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) {
// This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval.
// Promising, but this read must be checked against the ending bound.
if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) {
// Yes, this read is within both bounds. This must be our next read.
nextRead = candidateRead;
break;
}
else {
// Oops, we're past the end bound. Increment the current bound and try again.
currentBound++;
continue;
}
}
else {
// Oops, we're past the end bound. Increment the current bound and try again.
currentBound++;
}
else {
// Unmapped read filter; just check getReadUnmappedFlag().
if(!candidateRead.getReadUnmappedFlag())
continue;
}
nextRead = candidateRead;
break;
}
// No more reads available. Stop the search.

View File

@ -26,6 +26,14 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
protected final int start;
protected final int stop;
protected final String contigName;
/**
* A static constant to use when referring to the unmapped section of a datafile
* file. The unmapped region cannot be subdivided. Only this instance of
* the object may be used to refer to the region, as '==' comparisons are used
* in comparators, etc.
*/
public static final GenomeLoc UNMAPPED = new GenomeLoc(null,-1,0,0);
// --------------------------------------------------------------------------------------------------------------
//
@ -64,6 +72,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
public final int getStart() { return this.start; }
public final int getStop() { return this.stop; }
public final String toString() {
if(this == UNMAPPED) return "unmapped";
if ( throughEndOfContigP() && atBeginningOfContigP() )
return getContig();
else if ( throughEndOfContigP() || getStart() == getStop() )
@ -91,6 +100,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
}
public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException {
if(this == UNMAPPED || that == UNMAPPED) {
if(this != UNMAPPED || that != UNMAPPED)
throw new ReviewedStingException("Tried to merge a mapped and an unmapped genome loc");
return UNMAPPED;
}
if (!(this.contiguousP(that))) {
throw new ReviewedStingException("The two genome loc's need to be contigous");
}
@ -101,6 +116,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
}
public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException {
if(this == UNMAPPED || that == UNMAPPED) {
if(this != UNMAPPED || that != UNMAPPED)
throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc");
return UNMAPPED;
}
if (!(this.overlapsP(that))) {
throw new ReviewedStingException("GenomeLoc::intersect(): The two genome loc's need to overlap");
}
@ -216,7 +237,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
int result = 0;
if ( this == that ) {
result = 0;
} else {
}
else if(this == UNMAPPED)
result = 1;
else if(that == UNMAPPED)
result = -1;
else {
final int cmpContig = compareContigs(that);
if ( cmpContig != 0 ) {

View File

@ -41,20 +41,21 @@ public class IntervalUtils {
// ensure that the arg list isn't null before looping.
for (String argument : argList) {
// if any interval argument is '-L all', consider all loci by returning no intervals
if (argument.equals("all")) {
if (argList.size() != 1) {
// throw error if '-L all' is not only interval - potentially conflicting commands
throw new UserException.CommandLineException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
}
return null;
}
// separate argument on semicolon first
for (String fileOrInterval : argument.split(";")) {
// if any interval argument is '-L all', consider all loci by returning no intervals
if (fileOrInterval.trim().toLowerCase().equals("all")) {
if (argList.size() != 1) {
// throw error if '-L all' is not only interval - potentially conflicting commands
throw new UserException.CommandLineException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
}
return null;
}
// if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'.
else if(fileOrInterval.trim().toLowerCase().equals("unmapped"))
rawIntervals.add(GenomeLoc.UNMAPPED);
// if it's a file, add items to raw interval list
if (isIntervalFile(fileOrInterval)) {
else if (isIntervalFile(fileOrInterval)) {
try {
rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList));
}