Commit to emit in sorted order so we don't have to use /tmp
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1133 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f5cba5a6bb
commit
8d3dc57c3d
|
|
@ -6,10 +6,7 @@ import net.sf.samtools.SAMRecord;
|
|||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -32,12 +29,6 @@ import java.util.*;
|
|||
*/
|
||||
public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||
|
||||
/**
|
||||
* The source of all cleaned intervals.
|
||||
*/
|
||||
@Argument(fullName="cleaned_intervals",shortName="ci",doc="Intervals which have been cleaned.",required=true)
|
||||
List<String> intervalsSource = null;
|
||||
|
||||
/**
|
||||
* The source of all cleaned reads.
|
||||
*/
|
||||
|
|
@ -51,19 +42,14 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
|||
String outputBAMFileName = null;
|
||||
|
||||
/**
|
||||
* A stream of processed intervals. The current head of the queue represents the next interval.
|
||||
* The set of (sorted) cleaned reads
|
||||
*/
|
||||
private Queue<GenomeLoc> intervals;
|
||||
private Queue<SAMRecord> cleanedReads = new LinkedList<SAMRecord>();
|
||||
|
||||
/**
|
||||
* The interval currently in process.
|
||||
* A fast lookup table for uniquified read info
|
||||
*/
|
||||
private GenomeLoc interval = null;
|
||||
|
||||
/**
|
||||
* A structure for fast lookup of all reads that have been cleaned in the current interval.
|
||||
*/
|
||||
private Map<String,SAMRecord> cleanedReads = new HashMap<String,SAMRecord>();
|
||||
private HashSet<String> cleanedReadHash = new HashSet<String>();
|
||||
|
||||
/**
|
||||
* The writer that handles writing of SAM files.
|
||||
|
|
@ -72,23 +58,32 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
|||
|
||||
@Override
|
||||
public void initialize() {
|
||||
intervals = parseIntervals( intervalsSource );
|
||||
interval = intervals.remove();
|
||||
loadCleanedReadsOverlappingInterval( interval );
|
||||
|
||||
// For now, read the whole damn file into memory. If this becomes a problem,
|
||||
// then we just need to read the hash into memory and the first read; we'd then
|
||||
// need to query the BAM file every time we needed to update the cleaned read iterator.
|
||||
CloseableIterator<SAMRecord> allReads = cleanedReadsSource.iterator();
|
||||
while ( allReads.hasNext() ) {
|
||||
SAMRecord read = allReads.next();
|
||||
cleanedReads.add(read);
|
||||
String uniquifiedReadName = getUniquifiedReadName(read);
|
||||
cleanedReadHash.add(getUniquifiedReadName(read));
|
||||
}
|
||||
allReads.close();
|
||||
|
||||
// HACK: The unit tests create their own output files. Make sure this walker doesn't step
|
||||
// on any toes.
|
||||
if( outputBAM == null ) {
|
||||
outputBAM = Utils.createSAMFileWriterWithCompression(getToolkit().getEngine().getSAMHeader(),
|
||||
false,
|
||||
true,
|
||||
outputBAMFileName,
|
||||
getToolkit().getBAMCompression());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* For every read, inspect the set of cleaned reads that could possibly replace the current one.
|
||||
* If one exists, add the cleaned read to the output. Otherwise, add the current read.
|
||||
* For every read, if it exists in the cleaned read set, ignore it; otherwise, emit it.
|
||||
* Also, if the head of the cleaned read list could be emitted here, do so.
|
||||
* @param ref Portion of the reference sequence overlapping this read.
|
||||
* @param read The read to inspect.
|
||||
* @return Number of reads cleaned by this iteration of the map function.
|
||||
|
|
@ -96,33 +91,22 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
|||
@Override
|
||||
public Integer map(char[] ref, SAMRecord read) {
|
||||
|
||||
if( read.getReadUnmappedFlag() ) {
|
||||
outputBAM.addAlignment(read);
|
||||
return 0;
|
||||
}
|
||||
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
|
||||
|
||||
while( loc.isPast(interval) && intervals.size() > 0 ) {
|
||||
interval = intervals.remove();
|
||||
loadCleanedReadsOverlappingInterval(interval);
|
||||
// first emit reads from the cleaned set if appropriate
|
||||
int cleanedReadCount = 0;
|
||||
SAMRecord firstCleanedRead = cleanedReads.peek();
|
||||
while ( firstCleanedRead != null && firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) {
|
||||
outputBAM.addAlignment(firstCleanedRead);
|
||||
cleanedReadCount++;
|
||||
cleanedReads.remove();
|
||||
firstCleanedRead = cleanedReads.peek();
|
||||
}
|
||||
|
||||
String uniquifiedReadName = getUniquifiedReadName(read);
|
||||
int numCleaned = 0;
|
||||
|
||||
if( loc.overlapsP(interval) && cleanedReads.containsKey(uniquifiedReadName) ) {
|
||||
SAMRecord cleanedRead = cleanedReads.get(uniquifiedReadName);
|
||||
cleanedRead.setReadName(read.getReadName());
|
||||
outputBAM.addAlignment(cleanedRead);
|
||||
numCleaned = 1;
|
||||
}
|
||||
else {
|
||||
if ( !cleanedReadHash.contains(uniquifiedReadName) )
|
||||
{
|
||||
outputBAM.addAlignment(read);
|
||||
numCleaned = 0;
|
||||
}
|
||||
|
||||
return numCleaned;
|
||||
return cleanedReadCount;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -148,35 +132,6 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
|||
outputBAM.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the intervals directly from the command-line or from file, as appropriate.
|
||||
* Merge overlapping intervals.
|
||||
* @param intervalsSource Source of intervals.
|
||||
* @return a queue of sorted, merged intervals.
|
||||
*/
|
||||
private Queue parseIntervals( List<String> intervalsSource ) {
|
||||
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource);
|
||||
GenomeLocSortedSet intervalSortedSet = new GenomeLocSortedSet();
|
||||
for( GenomeLoc parsedInterval: parsedIntervals )
|
||||
intervalSortedSet.addRegion(parsedInterval);
|
||||
|
||||
return new LinkedList<GenomeLoc>( intervalSortedSet );
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a list of all the reads overlapping the given interval into memory.
|
||||
* @param interval
|
||||
*/
|
||||
private void loadCleanedReadsOverlappingInterval( GenomeLoc interval ) {
|
||||
CloseableIterator<SAMRecord> overlappingReads = cleanedReadsSource.queryOverlapping( interval.getContig(), (int)interval.getStart(), (int)interval.getStop() );
|
||||
// Load in all reads mapped to this region. The cleaner will augment the read name in a way that uniquifies it.
|
||||
while( overlappingReads.hasNext() ) {
|
||||
SAMRecord read = overlappingReads.next();
|
||||
cleanedReads.put( getUniquifiedReadName(read), read );
|
||||
}
|
||||
overlappingReads.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the distinguishing elements of the read name from the given read.
|
||||
* @param read read to uniquify.
|
||||
|
|
|
|||
Loading…
Reference in New Issue