From 8d3dc57c3dfb1a6dfa595eb7f2018e81f96f6d5e Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 30 Jun 2009 19:47:15 +0000 Subject: [PATCH] Commit to emit in sorted order so we don't have to use /tmp git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1133 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/CleanedReadInjector.java | 107 +++++------------- 1 file changed, 31 insertions(+), 76 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java index 75f7e56d7..0f8b4ef15 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java @@ -6,10 +6,7 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.*; @@ -32,12 +29,6 @@ import java.util.*; */ public class CleanedReadInjector extends ReadWalker { - /** - * The source of all cleaned intervals. - */ - @Argument(fullName="cleaned_intervals",shortName="ci",doc="Intervals which have been cleaned.",required=true) - List intervalsSource = null; - /** * The source of all cleaned reads. */ @@ -51,19 +42,14 @@ public class CleanedReadInjector extends ReadWalker { String outputBAMFileName = null; /** - * A stream of processed intervals. The current head of the queue represents the next interval. + * The set of (sorted) cleaned reads */ - private Queue intervals; + private Queue cleanedReads = new LinkedList(); /** - * The interval currently in process. + * A fast lookup table for uniquified read info */ - private GenomeLoc interval = null; - - /** - * A structure for fast lookup of all reads that have been cleaned in the current interval. - */ - private Map cleanedReads = new HashMap(); + private HashSet cleanedReadHash = new HashSet(); /** * The writer that handles writing of SAM files. @@ -72,23 +58,32 @@ public class CleanedReadInjector extends ReadWalker { @Override public void initialize() { - intervals = parseIntervals( intervalsSource ); - interval = intervals.remove(); - loadCleanedReadsOverlappingInterval( interval ); + + // For now, read the whole damn file into memory. If this becomes a problem, + // then we just need to read the hash into memory and the first read; we'd then + // need to query the BAM file every time we needed to update the cleaned read iterator. + CloseableIterator allReads = cleanedReadsSource.iterator(); + while ( allReads.hasNext() ) { + SAMRecord read = allReads.next(); + cleanedReads.add(read); + String uniquifiedReadName = getUniquifiedReadName(read); + cleanedReadHash.add(getUniquifiedReadName(read)); + } + allReads.close(); // HACK: The unit tests create their own output files. Make sure this walker doesn't step // on any toes. if( outputBAM == null ) { outputBAM = Utils.createSAMFileWriterWithCompression(getToolkit().getEngine().getSAMHeader(), - false, + true, outputBAMFileName, getToolkit().getBAMCompression()); } } /** - * For every read, inspect the set of cleaned reads that could possibly replace the current one. - * If one exists, add the cleaned read to the output. Otherwise, add the current read. + * For every read, if it exists in the cleaned read set, ignore it; otherwise, emit it. + * Also, if the head of the cleaned read list could be emitted here, do so. * @param ref Portion of the reference sequence overlapping this read. * @param read The read to inspect. * @return Number of reads cleaned by this iteration of the map function. @@ -96,33 +91,22 @@ public class CleanedReadInjector extends ReadWalker { @Override public Integer map(char[] ref, SAMRecord read) { - if( read.getReadUnmappedFlag() ) { - outputBAM.addAlignment(read); - return 0; - } - - GenomeLoc loc = GenomeLocParser.createGenomeLoc(read); - - while( loc.isPast(interval) && intervals.size() > 0 ) { - interval = intervals.remove(); - loadCleanedReadsOverlappingInterval(interval); + // first emit reads from the cleaned set if appropriate + int cleanedReadCount = 0; + SAMRecord firstCleanedRead = cleanedReads.peek(); + while ( firstCleanedRead != null && firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) { + outputBAM.addAlignment(firstCleanedRead); + cleanedReadCount++; + cleanedReads.remove(); + firstCleanedRead = cleanedReads.peek(); } String uniquifiedReadName = getUniquifiedReadName(read); - int numCleaned = 0; - - if( loc.overlapsP(interval) && cleanedReads.containsKey(uniquifiedReadName) ) { - SAMRecord cleanedRead = cleanedReads.get(uniquifiedReadName); - cleanedRead.setReadName(read.getReadName()); - outputBAM.addAlignment(cleanedRead); - numCleaned = 1; - } - else { + if ( !cleanedReadHash.contains(uniquifiedReadName) ) + { outputBAM.addAlignment(read); - numCleaned = 0; } - - return numCleaned; + return cleanedReadCount; } /** @@ -148,35 +132,6 @@ public class CleanedReadInjector extends ReadWalker { outputBAM.close(); } - /** - * Load the intervals directly from the command-line or from file, as appropriate. - * Merge overlapping intervals. - * @param intervalsSource Source of intervals. - * @return a queue of sorted, merged intervals. - */ - private Queue parseIntervals( List intervalsSource ) { - List parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource); - GenomeLocSortedSet intervalSortedSet = new GenomeLocSortedSet(); - for( GenomeLoc parsedInterval: parsedIntervals ) - intervalSortedSet.addRegion(parsedInterval); - - return new LinkedList( intervalSortedSet ); - } - - /** - * Load a list of all the reads overlapping the given interval into memory. - * @param interval - */ - private void loadCleanedReadsOverlappingInterval( GenomeLoc interval ) { - CloseableIterator overlappingReads = cleanedReadsSource.queryOverlapping( interval.getContig(), (int)interval.getStart(), (int)interval.getStop() ); - // Load in all reads mapped to this region. The cleaner will augment the read name in a way that uniquifies it. - while( overlappingReads.hasNext() ) { - SAMRecord read = overlappingReads.next(); - cleanedReads.put( getUniquifiedReadName(read), read ); - } - overlappingReads.close(); - } - /** * Gets the distinguishing elements of the read name from the given read. * @param read read to uniquify.