Commit to emit in sorted order so we don't have to use /tmp
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1133 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f5cba5a6bb
commit
8d3dc57c3d
|
|
@ -6,10 +6,7 @@ import net.sf.samtools.SAMRecord;
|
||||||
import net.sf.samtools.util.CloseableIterator;
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -32,12 +29,6 @@ import java.util.*;
|
||||||
*/
|
*/
|
||||||
public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
/**
|
|
||||||
* The source of all cleaned intervals.
|
|
||||||
*/
|
|
||||||
@Argument(fullName="cleaned_intervals",shortName="ci",doc="Intervals which have been cleaned.",required=true)
|
|
||||||
List<String> intervalsSource = null;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The source of all cleaned reads.
|
* The source of all cleaned reads.
|
||||||
*/
|
*/
|
||||||
|
|
@ -51,19 +42,14 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
String outputBAMFileName = null;
|
String outputBAMFileName = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A stream of processed intervals. The current head of the queue represents the next interval.
|
* The set of (sorted) cleaned reads
|
||||||
*/
|
*/
|
||||||
private Queue<GenomeLoc> intervals;
|
private Queue<SAMRecord> cleanedReads = new LinkedList<SAMRecord>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The interval currently in process.
|
* A fast lookup table for uniquified read info
|
||||||
*/
|
*/
|
||||||
private GenomeLoc interval = null;
|
private HashSet<String> cleanedReadHash = new HashSet<String>();
|
||||||
|
|
||||||
/**
|
|
||||||
* A structure for fast lookup of all reads that have been cleaned in the current interval.
|
|
||||||
*/
|
|
||||||
private Map<String,SAMRecord> cleanedReads = new HashMap<String,SAMRecord>();
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The writer that handles writing of SAM files.
|
* The writer that handles writing of SAM files.
|
||||||
|
|
@ -72,23 +58,32 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
intervals = parseIntervals( intervalsSource );
|
|
||||||
interval = intervals.remove();
|
// For now, read the whole damn file into memory. If this becomes a problem,
|
||||||
loadCleanedReadsOverlappingInterval( interval );
|
// then we just need to read the hash into memory and the first read; we'd then
|
||||||
|
// need to query the BAM file every time we needed to update the cleaned read iterator.
|
||||||
|
CloseableIterator<SAMRecord> allReads = cleanedReadsSource.iterator();
|
||||||
|
while ( allReads.hasNext() ) {
|
||||||
|
SAMRecord read = allReads.next();
|
||||||
|
cleanedReads.add(read);
|
||||||
|
String uniquifiedReadName = getUniquifiedReadName(read);
|
||||||
|
cleanedReadHash.add(getUniquifiedReadName(read));
|
||||||
|
}
|
||||||
|
allReads.close();
|
||||||
|
|
||||||
// HACK: The unit tests create their own output files. Make sure this walker doesn't step
|
// HACK: The unit tests create their own output files. Make sure this walker doesn't step
|
||||||
// on any toes.
|
// on any toes.
|
||||||
if( outputBAM == null ) {
|
if( outputBAM == null ) {
|
||||||
outputBAM = Utils.createSAMFileWriterWithCompression(getToolkit().getEngine().getSAMHeader(),
|
outputBAM = Utils.createSAMFileWriterWithCompression(getToolkit().getEngine().getSAMHeader(),
|
||||||
false,
|
true,
|
||||||
outputBAMFileName,
|
outputBAMFileName,
|
||||||
getToolkit().getBAMCompression());
|
getToolkit().getBAMCompression());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For every read, inspect the set of cleaned reads that could possibly replace the current one.
|
* For every read, if it exists in the cleaned read set, ignore it; otherwise, emit it.
|
||||||
* If one exists, add the cleaned read to the output. Otherwise, add the current read.
|
* Also, if the head of the cleaned read list could be emitted here, do so.
|
||||||
* @param ref Portion of the reference sequence overlapping this read.
|
* @param ref Portion of the reference sequence overlapping this read.
|
||||||
* @param read The read to inspect.
|
* @param read The read to inspect.
|
||||||
* @return Number of reads cleaned by this iteration of the map function.
|
* @return Number of reads cleaned by this iteration of the map function.
|
||||||
|
|
@ -96,33 +91,22 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
@Override
|
@Override
|
||||||
public Integer map(char[] ref, SAMRecord read) {
|
public Integer map(char[] ref, SAMRecord read) {
|
||||||
|
|
||||||
if( read.getReadUnmappedFlag() ) {
|
// first emit reads from the cleaned set if appropriate
|
||||||
outputBAM.addAlignment(read);
|
int cleanedReadCount = 0;
|
||||||
return 0;
|
SAMRecord firstCleanedRead = cleanedReads.peek();
|
||||||
}
|
while ( firstCleanedRead != null && firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) {
|
||||||
|
outputBAM.addAlignment(firstCleanedRead);
|
||||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
|
cleanedReadCount++;
|
||||||
|
cleanedReads.remove();
|
||||||
while( loc.isPast(interval) && intervals.size() > 0 ) {
|
firstCleanedRead = cleanedReads.peek();
|
||||||
interval = intervals.remove();
|
|
||||||
loadCleanedReadsOverlappingInterval(interval);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
String uniquifiedReadName = getUniquifiedReadName(read);
|
String uniquifiedReadName = getUniquifiedReadName(read);
|
||||||
int numCleaned = 0;
|
if ( !cleanedReadHash.contains(uniquifiedReadName) )
|
||||||
|
{
|
||||||
if( loc.overlapsP(interval) && cleanedReads.containsKey(uniquifiedReadName) ) {
|
|
||||||
SAMRecord cleanedRead = cleanedReads.get(uniquifiedReadName);
|
|
||||||
cleanedRead.setReadName(read.getReadName());
|
|
||||||
outputBAM.addAlignment(cleanedRead);
|
|
||||||
numCleaned = 1;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
outputBAM.addAlignment(read);
|
outputBAM.addAlignment(read);
|
||||||
numCleaned = 0;
|
|
||||||
}
|
}
|
||||||
|
return cleanedReadCount;
|
||||||
return numCleaned;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -148,35 +132,6 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
outputBAM.close();
|
outputBAM.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Load the intervals directly from the command-line or from file, as appropriate.
|
|
||||||
* Merge overlapping intervals.
|
|
||||||
* @param intervalsSource Source of intervals.
|
|
||||||
* @return a queue of sorted, merged intervals.
|
|
||||||
*/
|
|
||||||
private Queue parseIntervals( List<String> intervalsSource ) {
|
|
||||||
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource);
|
|
||||||
GenomeLocSortedSet intervalSortedSet = new GenomeLocSortedSet();
|
|
||||||
for( GenomeLoc parsedInterval: parsedIntervals )
|
|
||||||
intervalSortedSet.addRegion(parsedInterval);
|
|
||||||
|
|
||||||
return new LinkedList<GenomeLoc>( intervalSortedSet );
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Load a list of all the reads overlapping the given interval into memory.
|
|
||||||
* @param interval
|
|
||||||
*/
|
|
||||||
private void loadCleanedReadsOverlappingInterval( GenomeLoc interval ) {
|
|
||||||
CloseableIterator<SAMRecord> overlappingReads = cleanedReadsSource.queryOverlapping( interval.getContig(), (int)interval.getStart(), (int)interval.getStop() );
|
|
||||||
// Load in all reads mapped to this region. The cleaner will augment the read name in a way that uniquifies it.
|
|
||||||
while( overlappingReads.hasNext() ) {
|
|
||||||
SAMRecord read = overlappingReads.next();
|
|
||||||
cleanedReads.put( getUniquifiedReadName(read), read );
|
|
||||||
}
|
|
||||||
overlappingReads.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the distinguishing elements of the read name from the given read.
|
* Gets the distinguishing elements of the read name from the given read.
|
||||||
* @param read read to uniquify.
|
* @param read read to uniquify.
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue