From 1905b5defa04f82d37226385706da7c88d926c21 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 7 Oct 2009 20:06:25 +0000 Subject: [PATCH] Hash by chromosome for now to reduce memory. This is a temporary solution until we decide how to reture the Injector for good. Also, with Picard's latest changes, we need to make sure we don't double-close the sam writer. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1779 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/CleanedReadInjector.java | 137 ++++++++++++------ 1 file changed, 89 insertions(+), 48 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/CleanedReadInjector.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/CleanedReadInjector.java index 206592adf..27e3ff49d 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/CleanedReadInjector.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/CleanedReadInjector.java @@ -42,6 +42,11 @@ public class CleanedReadInjector extends ReadWalker { @Argument(fullName="output_bam",shortName="ob",doc="Output BAM file",required=true) SAMFileWriter outputBAM = null; + /** + * The iterator for the cleaned reads + */ + private ByContigIterator cleanedReadsIterator; + /** * The set of (sorted) cleaned reads */ @@ -60,31 +65,26 @@ public class CleanedReadInjector extends ReadWalker { @Override public void initialize() { - // For now, read the whole damn file into memory. If this becomes a problem, - // then we just need to read the hash into memory and the first read; we'd then - // need to query the BAM file every time we needed to update the cleaned read iterator. - CloseableIterator allReads = cleanedReadsSource.iterator(); - while ( allReads.hasNext() ) { - SAMRecord read = allReads.next(); - cleanedReads.add(read); - cleanedReadHash.add(getUniquifiedReadName(read)); - } - allReads.close(); + // For now, read the whole file into memory a chromosome at a time (because we can + // never clean from one chromosome to another). If We ever have time to do this correctly + // (or it becomes a memory problem), then we'll need to read the hash into memory and the + // first read; we'd then query the BAM file every time we needed to update the cleaned read iterator. + cleanedReadsIterator = new ByContigIterator(cleanedReadsSource.iterator()); - // If there are intervals specified by the user,record them so we can make sure not - // to emit reads outside the intervals. For now, we'll group them by chromosome to - // make lookup a bit faster. + // If there are intervals specified by the user, record them so we can make sure not + // to emit reads outside the intervals. For now, we'll group them by chromosome to + // make lookup a bit faster. if ( this.getToolkit() != null && - this.getToolkit().getArguments().intervals != null ) { - intervals = new HashMap>(); - List locs = GenomeAnalysisEngine.parseIntervalRegion(this.getToolkit().getArguments().intervals); - Iterator iter = GenomeLocSortedSet.createSetFromList(locs).iterator(); - while ( iter.hasNext() ) { - GenomeLoc loc = iter.next(); - if ( intervals.get(loc.getContig()) == null ) - intervals.put(loc.getContig(), new ArrayList()); - intervals.get(loc.getContig()).add(loc); - } + this.getToolkit().getArguments().intervals != null ) { + intervals = new HashMap>(); + List locs = GenomeAnalysisEngine.parseIntervalRegion(this.getToolkit().getArguments().intervals); + Iterator iter = GenomeLocSortedSet.createSetFromList(locs).iterator(); + while ( iter.hasNext() ) { + GenomeLoc loc = iter.next(); + if ( intervals.get(loc.getContig()) == null ) + intervals.put(loc.getContig(), new ArrayList()); + intervals.get(loc.getContig()).add(loc); + } } } @@ -104,12 +104,15 @@ public class CleanedReadInjector extends ReadWalker { while ( firstCleanedRead != null && firstCleanedRead.getReferenceIndex() <= read.getReferenceIndex() && firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) { - if ( emit(firstCleanedRead) ) - cleanedReadCount++; - cleanedReads.remove(); + if ( emit(firstCleanedRead) ) + cleanedReadCount++; + cleanedReads.remove(); firstCleanedRead = cleanedReads.peek(); } + // update the hashes if necessary + cleanedReadsIterator.readNextContig(read.getReferenceIndex()); + if ( !cleanedReadHash.contains(getUniquifiedReadName(read)) ) outputBAM.addAlignment(read); return cleanedReadCount; @@ -119,31 +122,31 @@ public class CleanedReadInjector extends ReadWalker { * Determine whether to emit the given read; if so, return true. */ private boolean emit(SAMRecord read) { - // if no intervals were specified, emit everything - if ( intervals == null ) { - outputBAM.addAlignment(read); - return true; - } + // if no intervals were specified, emit everything + if ( intervals == null ) { + outputBAM.addAlignment(read); + return true; + } - ArrayList intervalList = intervals.get(read.getReferenceName()); - if ( intervalList == null ) - return false; + ArrayList intervalList = intervals.get(read.getReferenceName()); + if ( intervalList == null ) + return false; - GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read); - for ( GenomeLoc interval : intervalList ) { - // if it overlaps an interval, then we can emit it - if ( interval.overlapsP(readLoc) ) { - outputBAM.addAlignment(read); - return true; - } + GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read); + for ( GenomeLoc interval : intervalList ) { + // if it overlaps an interval, then we can emit it + if ( interval.overlapsP(readLoc) ) { + outputBAM.addAlignment(read); + return true; + } - // once we've passed any interval that could overlap it, just quit - if ( interval.isPast(readLoc) ) - return false; - } + // once we've passed any interval that could overlap it, just quit + if ( interval.isPast(readLoc) ) + return false; + } - // it didn't overlap an interval - return false; + // it didn't overlap an interval + return false; } /** @@ -166,7 +169,7 @@ public class CleanedReadInjector extends ReadWalker { @Override public void onTraversalDone( Integer value ) { - outputBAM.close(); + cleanedReadsIterator.iterator.close(); } /** @@ -177,4 +180,42 @@ public class CleanedReadInjector extends ReadWalker { private static String getUniquifiedReadName( SAMRecord read ) { return String.format("%s.%s.%s.%s",read.getAttribute("RG"),read.getReadName(),read.getFlags(),read.getReadString()); } + + private class ByContigIterator { + + SAMRecord nextRead; + CloseableIterator iterator; + int contig = -1; + + public ByContigIterator(CloseableIterator iterator) { + this.iterator = iterator; + nextRead = (iterator.hasNext() ? iterator.next() : null); + if ( nextRead != null ) + readNextContig(nextRead.getReferenceIndex()); + } + + public void readNextContig(int newContig) { + // don't do anything if we're in the right contig or have no reads + if ( newContig == contig || nextRead == null ) + return; + + contig = newContig; + cleanedReadHash.clear(); + cleanedReads.clear(); + + // first, get to the right contig + while ( nextRead != null && + nextRead.getReferenceIndex() != contig ) { + nextRead = (iterator.hasNext() ? iterator.next() : null); + } + + // now, read in all of the reads for this contig + while ( nextRead != null && + nextRead.getReferenceIndex() == contig ) { + cleanedReads.add(nextRead); + cleanedReadHash.add(getUniquifiedReadName(nextRead)); + nextRead = (iterator.hasNext() ? iterator.next() : null); + } + } + } }