Support for '-L unmapped' in read walkers. DO NOT USE THIS PATCH YET. It has been
subjected to and passes cursory testing on one dataset (and all integration tests pass). However, there's a small library of validation checks, and unit and integration tests that must be added. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4813 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a2d6cef181
commit
d4d3170436
|
|
@ -53,4 +53,12 @@ public interface BAMFormatAwareShard extends Shard {
|
|||
* @return An iterator over the reads stored in the shard.
|
||||
*/
|
||||
public StingSAMIterator iterator();
|
||||
|
||||
/**
|
||||
* Whether this shard points to an unmapped region.
|
||||
* Some shard types conceptually be unmapped (e.g. LocusShards). In
|
||||
* this case, isUnmapped should always return false.
|
||||
* @return True if this shard is unmapped. False otherwise.
|
||||
*/
|
||||
public boolean isUnmapped();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -87,21 +87,38 @@ public class IntervalSharder {
|
|||
GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
|
||||
String contig = null;
|
||||
|
||||
// If the next section of the BAM to be processed is unmapped, handle this region separately.
|
||||
while(locusIterator.hasNext() && nextBatch.isEmpty()) {
|
||||
contig = null;
|
||||
while(locusIterator.hasNext() && (contig == null || locusIterator.peek().getContig().equals(contig))) {
|
||||
while(locusIterator.hasNext() && (contig == null || (locusIterator.peek() != GenomeLoc.UNMAPPED && locusIterator.peek().getContig().equals(contig)))) {
|
||||
GenomeLoc nextLocus = locusIterator.next();
|
||||
contig = nextLocus.getContig();
|
||||
nextBatch.add(nextLocus);
|
||||
}
|
||||
}
|
||||
|
||||
if(nextBatch.size() > 0)
|
||||
if(nextBatch.size() > 0) {
|
||||
cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Merge / split intervals based on an awareness of the structure of the BAM file.
|
||||
* @param dataSource
|
||||
* @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
|
||||
* @param loci
|
||||
* @return
|
||||
*/
|
||||
private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
|
||||
// If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
|
||||
if(contig == null) {
|
||||
FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||
filePointer.addFileSpans(id,null);
|
||||
return Collections.singletonList(filePointer);
|
||||
}
|
||||
|
||||
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
FilePointer lastFilePointer = null;
|
||||
|
|
@ -111,7 +128,8 @@ public class IntervalSharder {
|
|||
BinMergingIterator binMerger = new BinMergingIterator();
|
||||
for(SAMReaderID id: dataSource.getReaderIDs()) {
|
||||
final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
|
||||
if(referenceSequence == null)
|
||||
// If this contig can't be found in the reference, skip over it.
|
||||
if(referenceSequence == null && contig != null)
|
||||
continue;
|
||||
final BrowseableBAMIndex index = dataSource.getIndex(id);
|
||||
binMerger.addReader(id,
|
||||
|
|
@ -360,16 +378,23 @@ class FilePointer {
|
|||
protected final BAMOverlap overlap;
|
||||
protected final List<GenomeLoc> locations;
|
||||
|
||||
/**
|
||||
* Does this file pointer point into an unmapped region?
|
||||
*/
|
||||
protected final boolean isRegionUnmapped;
|
||||
|
||||
public FilePointer(final GenomeLoc location) {
|
||||
referenceSequence = location.getContig();
|
||||
overlap = null;
|
||||
locations = Collections.singletonList(location);
|
||||
this.referenceSequence = location.getContig();
|
||||
this.overlap = null;
|
||||
this.locations = Collections.singletonList(location);
|
||||
this.isRegionUnmapped = location == GenomeLoc.UNMAPPED;
|
||||
}
|
||||
|
||||
public FilePointer(final String referenceSequence,final BAMOverlap overlap) {
|
||||
this.referenceSequence = referenceSequence;
|
||||
this.overlap = overlap;
|
||||
this.locations = new ArrayList<GenomeLoc>();
|
||||
this.isRegionUnmapped = false;
|
||||
}
|
||||
|
||||
public void addLocation(GenomeLoc location) {
|
||||
|
|
|
|||
|
|
@ -117,6 +117,15 @@ public class LocusShard implements BAMFormatAwareShard {
|
|||
return ShardType.LOCUS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Locus shards don't make sense as unmapped regions. Always return false.
|
||||
* @return False always.
|
||||
*/
|
||||
@Override
|
||||
public boolean isUnmapped() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets key read validation and filtering properties.
|
||||
* @return set of read properties associated with this shard.
|
||||
|
|
|
|||
|
|
@ -54,15 +54,21 @@ public class ReadShard implements BAMFormatAwareShard {
|
|||
*/
|
||||
private final List<GenomeLoc> loci;
|
||||
|
||||
/**
|
||||
* Whether the current location is unmapped.
|
||||
*/
|
||||
private final boolean isUnmapped;
|
||||
|
||||
/**
|
||||
* Statistics about which reads in this shards were used and which were filtered away.
|
||||
*/
|
||||
private final ReadMetrics readMetrics = new ReadMetrics();
|
||||
|
||||
public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci) {
|
||||
public ReadShard(SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
|
||||
this.readsDataSource = readsDataSource;
|
||||
this.fileSpans = fileSpans;
|
||||
this.loci = loci;
|
||||
this.isUnmapped = isUnmapped;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -88,6 +94,16 @@ public class ReadShard implements BAMFormatAwareShard {
|
|||
return loci;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether this shard points to an unmapped region.
|
||||
* @return True if this shard is unmapped. False otherwise.
|
||||
*/
|
||||
@Override
|
||||
public boolean isUnmapped() {
|
||||
return isUnmapped;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns true if this shard is meant to buffer reads, rather
|
||||
* than just holding pointers to their locations.
|
||||
|
|
|
|||
|
|
@ -79,6 +79,11 @@ public class ReadShardStrategy implements ShardStrategy {
|
|||
*/
|
||||
private Map<SAMReaderID,SAMFileSpan> position;
|
||||
|
||||
/**
|
||||
* An indicator whether the strategy has sharded into the unmapped region.
|
||||
*/
|
||||
private boolean isIntoUnmappedRegion = false;
|
||||
|
||||
/**
|
||||
* Create a new read shard strategy, loading read shards from the given BAM file.
|
||||
* @param dataSource Data source from which to load shards.
|
||||
|
|
@ -130,13 +135,27 @@ public class ReadShardStrategy implements ShardStrategy {
|
|||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
|
||||
if(!fileSpan.isEmpty())
|
||||
selectedReaders.put(id,fileSpan);
|
||||
// If the region contains location information (in other words, it is not at
|
||||
// the start of the unmapped region), add the region.
|
||||
if(currentFilePointer.isRegionUnmapped) {
|
||||
// If the region is unmapped and no location data exists, add a null as an indicator to
|
||||
// start at the next unmapped region.
|
||||
if(!isIntoUnmappedRegion) {
|
||||
selectedReaders.put(id,null);
|
||||
isIntoUnmappedRegion = true;
|
||||
}
|
||||
else
|
||||
selectedReaders.put(id,position.get(id));
|
||||
}
|
||||
else {
|
||||
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
|
||||
if(!fileSpan.isEmpty())
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
}
|
||||
|
||||
if(selectedReaders.size() > 0) {
|
||||
BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations);
|
||||
BAMFormatAwareShard shard = new ReadShard(dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||
dataSource.fillShard(shard);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
|
|
@ -150,7 +169,7 @@ public class ReadShardStrategy implements ShardStrategy {
|
|||
}
|
||||
}
|
||||
else {
|
||||
BAMFormatAwareShard shard = new ReadShard(dataSource,position,null);
|
||||
BAMFormatAwareShard shard = new ReadShard(dataSource,position,null,false);
|
||||
dataSource.fillShard(shard);
|
||||
nextShard = !shard.isBufferEmpty() ? shard : null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -501,9 +501,12 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
|
||||
|
||||
for(SAMReaderID id: getReaderIDs()) {
|
||||
if(shard.getFileSpans().get(id) == null)
|
||||
CloseableIterator<SAMRecord> iterator = null;
|
||||
if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null)
|
||||
continue;
|
||||
CloseableIterator<SAMRecord> iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
|
||||
iterator = shard.getFileSpans().get(id) != null ?
|
||||
readers.getReader(id).iterator(shard.getFileSpans().get(id)) :
|
||||
readers.getReader(id).queryUnmapped();
|
||||
if(readProperties.getReadBufferSize() != null)
|
||||
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
|
||||
if(shard.getGenomeLocs() != null)
|
||||
|
|
@ -823,6 +826,11 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
*/
|
||||
private SAMRecord nextRead;
|
||||
|
||||
/**
|
||||
* Rather than using the straight genomic bounds, use filter out only mapped reads.
|
||||
*/
|
||||
private boolean keepOnlyUnmappedReads;
|
||||
|
||||
/**
|
||||
* Custom representation of interval bounds.
|
||||
* Makes it simpler to track current position.
|
||||
|
|
@ -837,14 +845,30 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
|
||||
public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
|
||||
this.iterator = iterator;
|
||||
this.intervalStarts = new int[intervals.size()];
|
||||
this.intervalEnds = new int[intervals.size()];
|
||||
int i = 0;
|
||||
for(GenomeLoc interval: intervals) {
|
||||
intervalStarts[i] = (int)interval.getStart();
|
||||
intervalEnds[i] = (int)interval.getStop();
|
||||
i++;
|
||||
|
||||
// Look at the interval list to detect whether we should worry about unmapped reads.
|
||||
// If we find a mix of mapped/unmapped intervals, throw an exception.
|
||||
boolean foundMappedIntervals = false;
|
||||
for(GenomeLoc location: intervals) {
|
||||
if(location != GenomeLoc.UNMAPPED)
|
||||
foundMappedIntervals = true;
|
||||
keepOnlyUnmappedReads |= (location == GenomeLoc.UNMAPPED);
|
||||
}
|
||||
|
||||
|
||||
if(foundMappedIntervals) {
|
||||
if(keepOnlyUnmappedReads)
|
||||
throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
|
||||
this.intervalStarts = new int[intervals.size()];
|
||||
this.intervalEnds = new int[intervals.size()];
|
||||
int i = 0;
|
||||
for(GenomeLoc interval: intervals) {
|
||||
intervalStarts[i] = (int)interval.getStart();
|
||||
intervalEnds[i] = (int)interval.getStop();
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
advance();
|
||||
}
|
||||
|
||||
|
|
@ -876,21 +900,31 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
return;
|
||||
|
||||
SAMRecord candidateRead = iterator.next();
|
||||
while(nextRead == null && currentBound < intervalStarts.length) {
|
||||
if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||
(candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) {
|
||||
// This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval.
|
||||
// Promising, but this read must be checked against the ending bound.
|
||||
if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) {
|
||||
// Yes, this read is within both bounds. This must be our next read.
|
||||
nextRead = candidateRead;
|
||||
break;
|
||||
while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
|
||||
if(!keepOnlyUnmappedReads) {
|
||||
// Mapped read filter; check against GenomeLoc-derived bounds.
|
||||
if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||
(candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) {
|
||||
// This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval.
|
||||
// Promising, but this read must be checked against the ending bound.
|
||||
if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) {
|
||||
// Yes, this read is within both bounds. This must be our next read.
|
||||
nextRead = candidateRead;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
// Oops, we're past the end bound. Increment the current bound and try again.
|
||||
currentBound++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Oops, we're past the end bound. Increment the current bound and try again.
|
||||
currentBound++;
|
||||
}
|
||||
else {
|
||||
// Unmapped read filter; just check getReadUnmappedFlag().
|
||||
if(!candidateRead.getReadUnmappedFlag())
|
||||
continue;
|
||||
}
|
||||
nextRead = candidateRead;
|
||||
break;
|
||||
}
|
||||
|
||||
// No more reads available. Stop the search.
|
||||
|
|
|
|||
|
|
@ -26,6 +26,14 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
|
|||
protected final int start;
|
||||
protected final int stop;
|
||||
protected final String contigName;
|
||||
|
||||
/**
|
||||
* A static constant to use when referring to the unmapped section of a datafile
|
||||
* file. The unmapped region cannot be subdivided. Only this instance of
|
||||
* the object may be used to refer to the region, as '==' comparisons are used
|
||||
* in comparators, etc.
|
||||
*/
|
||||
public static final GenomeLoc UNMAPPED = new GenomeLoc(null,-1,0,0);
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -64,6 +72,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
|
|||
public final int getStart() { return this.start; }
|
||||
public final int getStop() { return this.stop; }
|
||||
public final String toString() {
|
||||
if(this == UNMAPPED) return "unmapped";
|
||||
if ( throughEndOfContigP() && atBeginningOfContigP() )
|
||||
return getContig();
|
||||
else if ( throughEndOfContigP() || getStart() == getStop() )
|
||||
|
|
@ -91,6 +100,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
|
|||
}
|
||||
|
||||
public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException {
|
||||
if(this == UNMAPPED || that == UNMAPPED) {
|
||||
if(this != UNMAPPED || that != UNMAPPED)
|
||||
throw new ReviewedStingException("Tried to merge a mapped and an unmapped genome loc");
|
||||
return UNMAPPED;
|
||||
}
|
||||
|
||||
if (!(this.contiguousP(that))) {
|
||||
throw new ReviewedStingException("The two genome loc's need to be contigous");
|
||||
}
|
||||
|
|
@ -101,6 +116,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
|
|||
}
|
||||
|
||||
public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException {
|
||||
if(this == UNMAPPED || that == UNMAPPED) {
|
||||
if(this != UNMAPPED || that != UNMAPPED)
|
||||
throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc");
|
||||
return UNMAPPED;
|
||||
}
|
||||
|
||||
if (!(this.overlapsP(that))) {
|
||||
throw new ReviewedStingException("GenomeLoc::intersect(): The two genome loc's need to overlap");
|
||||
}
|
||||
|
|
@ -216,7 +237,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
|
|||
int result = 0;
|
||||
if ( this == that ) {
|
||||
result = 0;
|
||||
} else {
|
||||
}
|
||||
else if(this == UNMAPPED)
|
||||
result = 1;
|
||||
else if(that == UNMAPPED)
|
||||
result = -1;
|
||||
else {
|
||||
final int cmpContig = compareContigs(that);
|
||||
|
||||
if ( cmpContig != 0 ) {
|
||||
|
|
|
|||
|
|
@ -41,20 +41,21 @@ public class IntervalUtils {
|
|||
// ensure that the arg list isn't null before looping.
|
||||
for (String argument : argList) {
|
||||
|
||||
// if any interval argument is '-L all', consider all loci by returning no intervals
|
||||
if (argument.equals("all")) {
|
||||
if (argList.size() != 1) {
|
||||
// throw error if '-L all' is not only interval - potentially conflicting commands
|
||||
throw new UserException.CommandLineException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// separate argument on semicolon first
|
||||
for (String fileOrInterval : argument.split(";")) {
|
||||
|
||||
// if any interval argument is '-L all', consider all loci by returning no intervals
|
||||
if (fileOrInterval.trim().toLowerCase().equals("all")) {
|
||||
if (argList.size() != 1) {
|
||||
// throw error if '-L all' is not only interval - potentially conflicting commands
|
||||
throw new UserException.CommandLineException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
// if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'.
|
||||
else if(fileOrInterval.trim().toLowerCase().equals("unmapped"))
|
||||
rawIntervals.add(GenomeLoc.UNMAPPED);
|
||||
// if it's a file, add items to raw interval list
|
||||
if (isIntervalFile(fileOrInterval)) {
|
||||
else if (isIntervalFile(fileOrInterval)) {
|
||||
try {
|
||||
rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList));
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue