Commit to emit in sorted order so we don't have to use /tmp

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1133 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-06-30 19:47:15 +00:00
parent f5cba5a6bb
commit 8d3dc57c3d
1 changed files with 31 additions and 76 deletions

View File

@ -6,10 +6,7 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator; import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*; import java.util.*;
@ -32,12 +29,6 @@ import java.util.*;
*/ */
public class CleanedReadInjector extends ReadWalker<Integer,Integer> { public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
/**
* The source of all cleaned intervals.
*/
@Argument(fullName="cleaned_intervals",shortName="ci",doc="Intervals which have been cleaned.",required=true)
List<String> intervalsSource = null;
/** /**
* The source of all cleaned reads. * The source of all cleaned reads.
*/ */
@ -51,19 +42,14 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
String outputBAMFileName = null; String outputBAMFileName = null;
/** /**
* A stream of processed intervals. The current head of the queue represents the next interval. * The set of (sorted) cleaned reads
*/ */
private Queue<GenomeLoc> intervals; private Queue<SAMRecord> cleanedReads = new LinkedList<SAMRecord>();
/** /**
* The interval currently in process. * A fast lookup table for uniquified read info
*/ */
private GenomeLoc interval = null; private HashSet<String> cleanedReadHash = new HashSet<String>();
/**
* A structure for fast lookup of all reads that have been cleaned in the current interval.
*/
private Map<String,SAMRecord> cleanedReads = new HashMap<String,SAMRecord>();
/** /**
* The writer that handles writing of SAM files. * The writer that handles writing of SAM files.
@ -72,23 +58,32 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
@Override @Override
public void initialize() { public void initialize() {
intervals = parseIntervals( intervalsSource );
interval = intervals.remove(); // For now, read the whole damn file into memory. If this becomes a problem,
loadCleanedReadsOverlappingInterval( interval ); // then we just need to read the hash into memory and the first read; we'd then
// need to query the BAM file every time we needed to update the cleaned read iterator.
CloseableIterator<SAMRecord> allReads = cleanedReadsSource.iterator();
while ( allReads.hasNext() ) {
SAMRecord read = allReads.next();
cleanedReads.add(read);
String uniquifiedReadName = getUniquifiedReadName(read);
cleanedReadHash.add(getUniquifiedReadName(read));
}
allReads.close();
// HACK: The unit tests create their own output files. Make sure this walker doesn't step // HACK: The unit tests create their own output files. Make sure this walker doesn't step
// on any toes. // on any toes.
if( outputBAM == null ) { if( outputBAM == null ) {
outputBAM = Utils.createSAMFileWriterWithCompression(getToolkit().getEngine().getSAMHeader(), outputBAM = Utils.createSAMFileWriterWithCompression(getToolkit().getEngine().getSAMHeader(),
false, true,
outputBAMFileName, outputBAMFileName,
getToolkit().getBAMCompression()); getToolkit().getBAMCompression());
} }
} }
/** /**
* For every read, inspect the set of cleaned reads that could possibly replace the current one. * For every read, if it exists in the cleaned read set, ignore it; otherwise, emit it.
* If one exists, add the cleaned read to the output. Otherwise, add the current read. * Also, if the head of the cleaned read list could be emitted here, do so.
* @param ref Portion of the reference sequence overlapping this read. * @param ref Portion of the reference sequence overlapping this read.
* @param read The read to inspect. * @param read The read to inspect.
* @return Number of reads cleaned by this iteration of the map function. * @return Number of reads cleaned by this iteration of the map function.
@ -96,33 +91,22 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
@Override @Override
public Integer map(char[] ref, SAMRecord read) { public Integer map(char[] ref, SAMRecord read) {
if( read.getReadUnmappedFlag() ) { // first emit reads from the cleaned set if appropriate
outputBAM.addAlignment(read); int cleanedReadCount = 0;
return 0; SAMRecord firstCleanedRead = cleanedReads.peek();
} while ( firstCleanedRead != null && firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) {
outputBAM.addAlignment(firstCleanedRead);
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read); cleanedReadCount++;
cleanedReads.remove();
while( loc.isPast(interval) && intervals.size() > 0 ) { firstCleanedRead = cleanedReads.peek();
interval = intervals.remove();
loadCleanedReadsOverlappingInterval(interval);
} }
String uniquifiedReadName = getUniquifiedReadName(read); String uniquifiedReadName = getUniquifiedReadName(read);
int numCleaned = 0; if ( !cleanedReadHash.contains(uniquifiedReadName) )
{
if( loc.overlapsP(interval) && cleanedReads.containsKey(uniquifiedReadName) ) {
SAMRecord cleanedRead = cleanedReads.get(uniquifiedReadName);
cleanedRead.setReadName(read.getReadName());
outputBAM.addAlignment(cleanedRead);
numCleaned = 1;
}
else {
outputBAM.addAlignment(read); outputBAM.addAlignment(read);
numCleaned = 0;
} }
return cleanedReadCount;
return numCleaned;
} }
/** /**
@ -148,35 +132,6 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
outputBAM.close(); outputBAM.close();
} }
/**
* Load the intervals directly from the command-line or from file, as appropriate.
* Merge overlapping intervals.
* @param intervalsSource Source of intervals.
* @return a queue of sorted, merged intervals.
*/
private Queue parseIntervals( List<String> intervalsSource ) {
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource);
GenomeLocSortedSet intervalSortedSet = new GenomeLocSortedSet();
for( GenomeLoc parsedInterval: parsedIntervals )
intervalSortedSet.addRegion(parsedInterval);
return new LinkedList<GenomeLoc>( intervalSortedSet );
}
/**
* Load a list of all the reads overlapping the given interval into memory.
* @param interval
*/
private void loadCleanedReadsOverlappingInterval( GenomeLoc interval ) {
CloseableIterator<SAMRecord> overlappingReads = cleanedReadsSource.queryOverlapping( interval.getContig(), (int)interval.getStart(), (int)interval.getStop() );
// Load in all reads mapped to this region. The cleaner will augment the read name in a way that uniquifies it.
while( overlappingReads.hasNext() ) {
SAMRecord read = overlappingReads.next();
cleanedReads.put( getUniquifiedReadName(read), read );
}
overlappingReads.close();
}
/** /**
* Gets the distinguishing elements of the read name from the given read. * Gets the distinguishing elements of the read name from the given read.
* @param read read to uniquify. * @param read read to uniquify.