Commit to emit in sorted order so we don't have to use /tmp

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1133 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-06-30 19:47:15 +00:00
parent f5cba5a6bb
commit 8d3dc57c3d
1 changed files with 31 additions and 76 deletions

View File

@ -6,10 +6,7 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
@ -32,12 +29,6 @@ import java.util.*;
*/
public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
/**
* The source of all cleaned intervals.
*/
@Argument(fullName="cleaned_intervals",shortName="ci",doc="Intervals which have been cleaned.",required=true)
List<String> intervalsSource = null;
/**
* The source of all cleaned reads.
*/
@ -51,19 +42,14 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
String outputBAMFileName = null;
/**
* A stream of processed intervals. The current head of the queue represents the next interval.
* The set of (sorted) cleaned reads
*/
private Queue<GenomeLoc> intervals;
private Queue<SAMRecord> cleanedReads = new LinkedList<SAMRecord>();
/**
* The interval currently in process.
* A fast lookup table for uniquified read info
*/
private GenomeLoc interval = null;
/**
* A structure for fast lookup of all reads that have been cleaned in the current interval.
*/
private Map<String,SAMRecord> cleanedReads = new HashMap<String,SAMRecord>();
private HashSet<String> cleanedReadHash = new HashSet<String>();
/**
* The writer that handles writing of SAM files.
@ -72,23 +58,32 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
@Override
public void initialize() {
intervals = parseIntervals( intervalsSource );
interval = intervals.remove();
loadCleanedReadsOverlappingInterval( interval );
// For now, read the whole damn file into memory. If this becomes a problem,
// then we just need to read the hash into memory and the first read; we'd then
// need to query the BAM file every time we needed to update the cleaned read iterator.
CloseableIterator<SAMRecord> allReads = cleanedReadsSource.iterator();
while ( allReads.hasNext() ) {
SAMRecord read = allReads.next();
cleanedReads.add(read);
String uniquifiedReadName = getUniquifiedReadName(read);
cleanedReadHash.add(getUniquifiedReadName(read));
}
allReads.close();
// HACK: The unit tests create their own output files. Make sure this walker doesn't step
// on any toes.
if( outputBAM == null ) {
outputBAM = Utils.createSAMFileWriterWithCompression(getToolkit().getEngine().getSAMHeader(),
false,
true,
outputBAMFileName,
getToolkit().getBAMCompression());
}
}
/**
* For every read, inspect the set of cleaned reads that could possibly replace the current one.
* If one exists, add the cleaned read to the output. Otherwise, add the current read.
* For every read, if it exists in the cleaned read set, ignore it; otherwise, emit it.
* Also, if the head of the cleaned read list could be emitted here, do so.
* @param ref Portion of the reference sequence overlapping this read.
* @param read The read to inspect.
* @return Number of reads cleaned by this iteration of the map function.
@ -96,33 +91,22 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
@Override
public Integer map(char[] ref, SAMRecord read) {
if( read.getReadUnmappedFlag() ) {
outputBAM.addAlignment(read);
return 0;
}
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
while( loc.isPast(interval) && intervals.size() > 0 ) {
interval = intervals.remove();
loadCleanedReadsOverlappingInterval(interval);
// first emit reads from the cleaned set if appropriate
int cleanedReadCount = 0;
SAMRecord firstCleanedRead = cleanedReads.peek();
while ( firstCleanedRead != null && firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) {
outputBAM.addAlignment(firstCleanedRead);
cleanedReadCount++;
cleanedReads.remove();
firstCleanedRead = cleanedReads.peek();
}
String uniquifiedReadName = getUniquifiedReadName(read);
int numCleaned = 0;
if( loc.overlapsP(interval) && cleanedReads.containsKey(uniquifiedReadName) ) {
SAMRecord cleanedRead = cleanedReads.get(uniquifiedReadName);
cleanedRead.setReadName(read.getReadName());
outputBAM.addAlignment(cleanedRead);
numCleaned = 1;
}
else {
if ( !cleanedReadHash.contains(uniquifiedReadName) )
{
outputBAM.addAlignment(read);
numCleaned = 0;
}
return numCleaned;
return cleanedReadCount;
}
/**
@ -148,35 +132,6 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
outputBAM.close();
}
/**
* Load the intervals directly from the command-line or from file, as appropriate.
* Merge overlapping intervals.
* @param intervalsSource Source of intervals.
* @return a queue of sorted, merged intervals.
*/
private Queue parseIntervals( List<String> intervalsSource ) {
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource);
GenomeLocSortedSet intervalSortedSet = new GenomeLocSortedSet();
for( GenomeLoc parsedInterval: parsedIntervals )
intervalSortedSet.addRegion(parsedInterval);
return new LinkedList<GenomeLoc>( intervalSortedSet );
}
/**
* Load a list of all the reads overlapping the given interval into memory.
* @param interval
*/
private void loadCleanedReadsOverlappingInterval( GenomeLoc interval ) {
CloseableIterator<SAMRecord> overlappingReads = cleanedReadsSource.queryOverlapping( interval.getContig(), (int)interval.getStart(), (int)interval.getStop() );
// Load in all reads mapped to this region. The cleaner will augment the read name in a way that uniquifies it.
while( overlappingReads.hasNext() ) {
SAMRecord read = overlappingReads.next();
cleanedReads.put( getUniquifiedReadName(read), read );
}
overlappingReads.close();
}
/**
* Gets the distinguishing elements of the read name from the given read.
* @param read read to uniquify.