Support for '-L unmapped' in read walkers. DO NOT USE THIS PATCH YET. It has been

subjected to and passes cursory testing on one dataset (and all integration tests pass).
However, there's a small library of validation checks, and unit and integration tests 
that must be added.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4813 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-12-09 19:51:48 +00:00
parent a2d6cef181
commit d4d3170436
8 changed files with 185 additions and 47 deletions

View File

@ -53,4 +53,12 @@ public interface BAMFormatAwareShard extends Shard {
* @return An iterator over the reads stored in the shard. * @return An iterator over the reads stored in the shard.
*/ */
public StingSAMIterator iterator(); public StingSAMIterator iterator();
/**
* Whether this shard points to an unmapped region.
* Some shard types conceptually be unmapped (e.g. LocusShards). In
* this case, isUnmapped should always return false.
* @return True if this shard is unmapped. False otherwise.
*/
public boolean isUnmapped();
} }

View File

@ -87,21 +87,38 @@ public class IntervalSharder {
GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser()); GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
String contig = null; String contig = null;
// If the next section of the BAM to be processed is unmapped, handle this region separately.
while(locusIterator.hasNext() && nextBatch.isEmpty()) { while(locusIterator.hasNext() && nextBatch.isEmpty()) {
contig = null; contig = null;
while(locusIterator.hasNext() && (contig == null || locusIterator.peek().getContig().equals(contig))) { while(locusIterator.hasNext() && (contig == null || (locusIterator.peek() != GenomeLoc.UNMAPPED && locusIterator.peek().getContig().equals(contig)))) {
GenomeLoc nextLocus = locusIterator.next(); GenomeLoc nextLocus = locusIterator.next();
contig = nextLocus.getContig(); contig = nextLocus.getContig();
nextBatch.add(nextLocus); nextBatch.add(nextLocus);
} }
} }
if(nextBatch.size() > 0) if(nextBatch.size() > 0) {
cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch)); cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
}
} }
} }
/**
* Merge / split intervals based on an awareness of the structure of the BAM file.
* @param dataSource
* @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
* @param loci
* @return
*/
private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) { private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
// If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
if(contig == null) {
FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
filePointer.addFileSpans(id,null);
return Collections.singletonList(filePointer);
}
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
List<FilePointer> filePointers = new ArrayList<FilePointer>(); List<FilePointer> filePointers = new ArrayList<FilePointer>();
FilePointer lastFilePointer = null; FilePointer lastFilePointer = null;
@ -111,7 +128,8 @@ public class IntervalSharder {
BinMergingIterator binMerger = new BinMergingIterator(); BinMergingIterator binMerger = new BinMergingIterator();
for(SAMReaderID id: dataSource.getReaderIDs()) { for(SAMReaderID id: dataSource.getReaderIDs()) {
final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig); final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
if(referenceSequence == null) // If this contig can't be found in the reference, skip over it.
if(referenceSequence == null && contig != null)
continue; continue;
final BrowseableBAMIndex index = dataSource.getIndex(id); final BrowseableBAMIndex index = dataSource.getIndex(id);
binMerger.addReader(id, binMerger.addReader(id,
@ -360,16 +378,23 @@ class FilePointer {
protected final BAMOverlap overlap; protected final BAMOverlap overlap;
protected final List<GenomeLoc> locations; protected final List<GenomeLoc> locations;
/**
* Does this file pointer point into an unmapped region?
*/
protected final boolean isRegionUnmapped;
public FilePointer(final GenomeLoc location) { public FilePointer(final GenomeLoc location) {
referenceSequence = location.getContig(); this.referenceSequence = location.getContig();
overlap = null; this.overlap = null;
locations = Collections.singletonList(location); this.locations = Collections.singletonList(location);
this.isRegionUnmapped = location == GenomeLoc.UNMAPPED;
} }
public FilePointer(final String referenceSequence,final BAMOverlap overlap) { public FilePointer(final String referenceSequence,final BAMOverlap overlap) {
this.referenceSequence = referenceSequence; this.referenceSequence = referenceSequence;
this.overlap = overlap; this.overlap = overlap;
this.locations = new ArrayList<GenomeLoc>(); this.locations = new ArrayList<GenomeLoc>();
this.isRegionUnmapped = false;
} }
public void addLocation(GenomeLoc location) { public void addLocation(GenomeLoc location) {

View File

@ -117,6 +117,15 @@ public class LocusShard implements BAMFormatAwareShard {
return ShardType.LOCUS; return ShardType.LOCUS;
} }
/**
* Locus shards don't make sense as unmapped regions. Always return false.
* @return False always.
*/
@Override
public boolean isUnmapped() {
return false;
}
/** /**
* Gets key read validation and filtering properties. * Gets key read validation and filtering properties.
* @return set of read properties associated with this shard. * @return set of read properties associated with this shard.

View File

@ -54,15 +54,21 @@ public class ReadShard implements BAMFormatAwareShard {
*/ */
private final List<GenomeLoc> loci; private final List<GenomeLoc> loci;
/**
* Whether the current location is unmapped.
*/
private final boolean isUnmapped;
/** /**
* Statistics about which reads in this shards were used and which were filtered away. * Statistics about which reads in this shards were used and which were filtered away.
*/ */
private final ReadMetrics readMetrics = new ReadMetrics(); private final ReadMetrics readMetrics = new ReadMetrics();
public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci) { public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
this.readsDataSource = readsDataSource; this.readsDataSource = readsDataSource;
this.fileSpans = fileSpans; this.fileSpans = fileSpans;
this.loci = loci; this.loci = loci;
this.isUnmapped = isUnmapped;
} }
/** /**
@ -88,6 +94,16 @@ public class ReadShard implements BAMFormatAwareShard {
return loci; return loci;
} }
/**
* Whether this shard points to an unmapped region.
* @return True if this shard is unmapped. False otherwise.
*/
@Override
public boolean isUnmapped() {
return isUnmapped;
}
/** /**
* Returns true if this shard is meant to buffer reads, rather * Returns true if this shard is meant to buffer reads, rather
* than just holding pointers to their locations. * than just holding pointers to their locations.

View File

@ -79,6 +79,11 @@ public class ReadShardStrategy implements ShardStrategy {
*/ */
private Map<SAMReaderID,SAMFileSpan> position; private Map<SAMReaderID,SAMFileSpan> position;
/**
* An indicator whether the strategy has sharded into the unmapped region.
*/
private boolean isIntoUnmappedRegion = false;
/** /**
* Create a new read shard strategy, loading read shards from the given BAM file. * Create a new read shard strategy, loading read shards from the given BAM file.
* @param dataSource Data source from which to load shards. * @param dataSource Data source from which to load shards.
@ -130,13 +135,27 @@ public class ReadShardStrategy implements ShardStrategy {
while(selectedReaders.size() == 0 && currentFilePointer != null) { while(selectedReaders.size() == 0 && currentFilePointer != null) {
shardPosition = currentFilePointer.fileSpans; shardPosition = currentFilePointer.fileSpans;
for(SAMReaderID id: shardPosition.keySet()) { for(SAMReaderID id: shardPosition.keySet()) {
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id)); // If the region contains location information (in other words, it is not at
if(!fileSpan.isEmpty()) // the start of the unmapped region), add the region.
selectedReaders.put(id,fileSpan); if(currentFilePointer.isRegionUnmapped) {
// If the region is unmapped and no location data exists, add a null as an indicator to
// start at the next unmapped region.
if(!isIntoUnmappedRegion) {
selectedReaders.put(id,null);
isIntoUnmappedRegion = true;
}
else
selectedReaders.put(id,position.get(id));
}
else {
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
if(!fileSpan.isEmpty())
selectedReaders.put(id,fileSpan);
}
} }
if(selectedReaders.size() > 0) { if(selectedReaders.size() > 0) {
BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations); BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
dataSource.fillShard(shard); dataSource.fillShard(shard);
if(!shard.isBufferEmpty()) { if(!shard.isBufferEmpty()) {
@ -150,7 +169,7 @@ public class ReadShardStrategy implements ShardStrategy {
} }
} }
else { else {
BAMFormatAwareShard shard = new ReadShard(dataSource,position,null); BAMFormatAwareShard shard = new ReadShard(dataSource,position,null,false);
dataSource.fillShard(shard); dataSource.fillShard(shard);
nextShard = !shard.isBufferEmpty() ? shard : null; nextShard = !shard.isBufferEmpty() ? shard : null;
} }

View File

@ -501,9 +501,12 @@ public class SAMDataSource implements SimpleDataSource {
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true); MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
for(SAMReaderID id: getReaderIDs()) { for(SAMReaderID id: getReaderIDs()) {
if(shard.getFileSpans().get(id) == null) CloseableIterator<SAMRecord> iterator = null;
if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null)
continue; continue;
CloseableIterator<SAMRecord> iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); iterator = shard.getFileSpans().get(id) != null ?
readers.getReader(id).iterator(shard.getFileSpans().get(id)) :
readers.getReader(id).queryUnmapped();
if(readProperties.getReadBufferSize() != null) if(readProperties.getReadBufferSize() != null)
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize()); iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
if(shard.getGenomeLocs() != null) if(shard.getGenomeLocs() != null)
@ -823,6 +826,11 @@ public class SAMDataSource implements SimpleDataSource {
*/ */
private SAMRecord nextRead; private SAMRecord nextRead;
/**
* Rather than using the straight genomic bounds, use filter out only mapped reads.
*/
private boolean keepOnlyUnmappedReads;
/** /**
* Custom representation of interval bounds. * Custom representation of interval bounds.
* Makes it simpler to track current position. * Makes it simpler to track current position.
@ -837,14 +845,30 @@ public class SAMDataSource implements SimpleDataSource {
public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) { public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
this.iterator = iterator; this.iterator = iterator;
this.intervalStarts = new int[intervals.size()];
this.intervalEnds = new int[intervals.size()]; // Look at the interval list to detect whether we should worry about unmapped reads.
int i = 0; // If we find a mix of mapped/unmapped intervals, throw an exception.
for(GenomeLoc interval: intervals) { boolean foundMappedIntervals = false;
intervalStarts[i] = (int)interval.getStart(); for(GenomeLoc location: intervals) {
intervalEnds[i] = (int)interval.getStop(); if(location != GenomeLoc.UNMAPPED)
i++; foundMappedIntervals = true;
keepOnlyUnmappedReads |= (location == GenomeLoc.UNMAPPED);
} }
if(foundMappedIntervals) {
if(keepOnlyUnmappedReads)
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
this.intervalStarts = new int[intervals.size()];
this.intervalEnds = new int[intervals.size()];
int i = 0;
for(GenomeLoc interval: intervals) {
intervalStarts[i] = (int)interval.getStart();
intervalEnds[i] = (int)interval.getStop();
i++;
}
}
advance(); advance();
} }
@ -876,21 +900,31 @@ public class SAMDataSource implements SimpleDataSource {
return; return;
SAMRecord candidateRead = iterator.next(); SAMRecord candidateRead = iterator.next();
while(nextRead == null && currentBound < intervalStarts.length) { while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] || if(!keepOnlyUnmappedReads) {
(candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) { // Mapped read filter; check against GenomeLoc-derived bounds.
// This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval. if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] ||
// Promising, but this read must be checked against the ending bound. (candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) {
if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) { // This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval.
// Yes, this read is within both bounds. This must be our next read. // Promising, but this read must be checked against the ending bound.
nextRead = candidateRead; if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) {
break; // Yes, this read is within both bounds. This must be our next read.
nextRead = candidateRead;
break;
}
else {
// Oops, we're past the end bound. Increment the current bound and try again.
currentBound++;
continue;
}
} }
else { }
// Oops, we're past the end bound. Increment the current bound and try again. else {
currentBound++; // Unmapped read filter; just check getReadUnmappedFlag().
if(!candidateRead.getReadUnmappedFlag())
continue; continue;
} nextRead = candidateRead;
break;
} }
// No more reads available. Stop the search. // No more reads available. Stop the search.

View File

@ -26,6 +26,14 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
protected final int start; protected final int start;
protected final int stop; protected final int stop;
protected final String contigName; protected final String contigName;
/**
* A static constant to use when referring to the unmapped section of a datafile
* file. The unmapped region cannot be subdivided. Only this instance of
* the object may be used to refer to the region, as '==' comparisons are used
* in comparators, etc.
*/
public static final GenomeLoc UNMAPPED = new GenomeLoc(null,-1,0,0);
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
@ -64,6 +72,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
public final int getStart() { return this.start; } public final int getStart() { return this.start; }
public final int getStop() { return this.stop; } public final int getStop() { return this.stop; }
public final String toString() { public final String toString() {
if(this == UNMAPPED) return "unmapped";
if ( throughEndOfContigP() && atBeginningOfContigP() ) if ( throughEndOfContigP() && atBeginningOfContigP() )
return getContig(); return getContig();
else if ( throughEndOfContigP() || getStart() == getStop() ) else if ( throughEndOfContigP() || getStart() == getStop() )
@ -91,6 +100,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
} }
public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException { public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException {
if(this == UNMAPPED || that == UNMAPPED) {
if(this != UNMAPPED || that != UNMAPPED)
throw new ReviewedStingException("Tried to merge a mapped and an unmapped genome loc");
return UNMAPPED;
}
if (!(this.contiguousP(that))) { if (!(this.contiguousP(that))) {
throw new ReviewedStingException("The two genome loc's need to be contigous"); throw new ReviewedStingException("The two genome loc's need to be contigous");
} }
@ -101,6 +116,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
} }
public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException { public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException {
if(this == UNMAPPED || that == UNMAPPED) {
if(this != UNMAPPED || that != UNMAPPED)
throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc");
return UNMAPPED;
}
if (!(this.overlapsP(that))) { if (!(this.overlapsP(that))) {
throw new ReviewedStingException("GenomeLoc::intersect(): The two genome loc's need to overlap"); throw new ReviewedStingException("GenomeLoc::intersect(): The two genome loc's need to overlap");
} }
@ -216,7 +237,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
int result = 0; int result = 0;
if ( this == that ) { if ( this == that ) {
result = 0; result = 0;
} else { }
else if(this == UNMAPPED)
result = 1;
else if(that == UNMAPPED)
result = -1;
else {
final int cmpContig = compareContigs(that); final int cmpContig = compareContigs(that);
if ( cmpContig != 0 ) { if ( cmpContig != 0 ) {

View File

@ -41,20 +41,21 @@ public class IntervalUtils {
// ensure that the arg list isn't null before looping. // ensure that the arg list isn't null before looping.
for (String argument : argList) { for (String argument : argList) {
// if any interval argument is '-L all', consider all loci by returning no intervals
if (argument.equals("all")) {
if (argList.size() != 1) {
// throw error if '-L all' is not only interval - potentially conflicting commands
throw new UserException.CommandLineException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
}
return null;
}
// separate argument on semicolon first // separate argument on semicolon first
for (String fileOrInterval : argument.split(";")) { for (String fileOrInterval : argument.split(";")) {
// if any interval argument is '-L all', consider all loci by returning no intervals
if (fileOrInterval.trim().toLowerCase().equals("all")) {
if (argList.size() != 1) {
// throw error if '-L all' is not only interval - potentially conflicting commands
throw new UserException.CommandLineException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
}
return null;
}
// if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'.
else if(fileOrInterval.trim().toLowerCase().equals("unmapped"))
rawIntervals.add(GenomeLoc.UNMAPPED);
// if it's a file, add items to raw interval list // if it's a file, add items to raw interval list
if (isIntervalFile(fileOrInterval)) { else if (isIntervalFile(fileOrInterval)) {
try { try {
rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList)); rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList));
} }