Hash by chromosome for now to reduce memory. This is a temporary solution until we decide how to retire the Injector for good.

Also, with Picard's latest changes, we need to make sure we don't double-close the sam writer.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1779 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-07 20:06:25 +00:00
parent f9a1598d75
commit 1905b5defa
1 changed files with 89 additions and 48 deletions

View File

@ -42,6 +42,11 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
@Argument(fullName="output_bam",shortName="ob",doc="Output BAM file",required=true) @Argument(fullName="output_bam",shortName="ob",doc="Output BAM file",required=true)
SAMFileWriter outputBAM = null; SAMFileWriter outputBAM = null;
/**
* The iterator for the cleaned reads
*/
private ByContigIterator cleanedReadsIterator;
/** /**
* The set of (sorted) cleaned reads * The set of (sorted) cleaned reads
*/ */
@ -60,18 +65,13 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
@Override @Override
public void initialize() { public void initialize() {
// For now, read the whole damn file into memory. If this becomes a problem, // For now, read the whole file into memory a chromosome at a time (because we can
// then we just need to read the hash into memory and the first read; we'd then // never clean from one chromosome to another). If We ever have time to do this correctly
// need to query the BAM file every time we needed to update the cleaned read iterator. // (or it becomes a memory problem), then we'll need to read the hash into memory and the
CloseableIterator<SAMRecord> allReads = cleanedReadsSource.iterator(); // first read; we'd then query the BAM file every time we needed to update the cleaned read iterator.
while ( allReads.hasNext() ) { cleanedReadsIterator = new ByContigIterator(cleanedReadsSource.iterator());
SAMRecord read = allReads.next();
cleanedReads.add(read);
cleanedReadHash.add(getUniquifiedReadName(read));
}
allReads.close();
// If there are intervals specified by the user,record them so we can make sure not // If there are intervals specified by the user, record them so we can make sure not
// to emit reads outside the intervals. For now, we'll group them by chromosome to // to emit reads outside the intervals. For now, we'll group them by chromosome to
// make lookup a bit faster. // make lookup a bit faster.
if ( this.getToolkit() != null && if ( this.getToolkit() != null &&
@ -110,6 +110,9 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
firstCleanedRead = cleanedReads.peek(); firstCleanedRead = cleanedReads.peek();
} }
// update the hashes if necessary
cleanedReadsIterator.readNextContig(read.getReferenceIndex());
if ( !cleanedReadHash.contains(getUniquifiedReadName(read)) ) if ( !cleanedReadHash.contains(getUniquifiedReadName(read)) )
outputBAM.addAlignment(read); outputBAM.addAlignment(read);
return cleanedReadCount; return cleanedReadCount;
@ -166,7 +169,7 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
@Override @Override
public void onTraversalDone( Integer value ) { public void onTraversalDone( Integer value ) {
outputBAM.close(); cleanedReadsIterator.iterator.close();
} }
/** /**
@ -177,4 +180,42 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
private static String getUniquifiedReadName( SAMRecord read ) { private static String getUniquifiedReadName( SAMRecord read ) {
return String.format("%s.%s.%s.%s",read.getAttribute("RG"),read.getReadName(),read.getFlags(),read.getReadString()); return String.format("%s.%s.%s.%s",read.getAttribute("RG"),read.getReadName(),read.getFlags(),read.getReadString());
} }
/**
 * Feeds the cleaned reads into the enclosing walker's {@code cleanedReads} and
 * {@code cleanedReadHash} collections one contig at a time, so that only a single
 * contig's worth of cleaned reads is held in memory at once.
 *
 * NOTE(review): the skip logic assumes the underlying iterator yields reads in
 * coordinate order (ascending reference index) - confirm against the cleaned BAM.
 */
private class ByContigIterator {
    // the first read that has not been loaded yet; null once the iterator is exhausted
    SAMRecord nextRead;
    // the underlying source of cleaned reads; the owner is responsible for closing it
    CloseableIterator<SAMRecord> iterator;
    // reference index of the contig currently loaded (-1 until one has been loaded)
    int contig = -1;

    /**
     * Wraps the given iterator and immediately loads the contig of its first read (if any).
     *
     * @param iterator the iterator over the cleaned reads
     */
    public ByContigIterator(CloseableIterator<SAMRecord> iterator) {
        this.iterator = iterator;
        nextRead = (iterator.hasNext() ? iterator.next() : null);
        if ( nextRead != null )
            readNextContig(nextRead.getReferenceIndex());
    }

    /**
     * Replaces the cached cleaned reads (and their hash) with those of the given contig.
     * Does nothing if that contig is already loaded or if no cleaned reads remain.
     *
     * @param newContig the reference index of the contig to load
     */
    public void readNextContig(int newContig) {
        // don't do anything if we're in the right contig or have no reads
        if ( newContig == contig || nextRead == null )
            return;

        contig = newContig;
        cleanedReadHash.clear();
        cleanedReads.clear();

        // first, get to the right contig; only skip reads that sort before it so that
        // a contig without any cleaned reads doesn't make us throw away the cleaned
        // reads of every later contig
        while ( nextRead != null &&
                sortsBefore(nextRead.getReferenceIndex(), contig) ) {
            nextRead = (iterator.hasNext() ? iterator.next() : null);
        }

        // now, read in all of the reads for this contig
        while ( nextRead != null &&
                nextRead.getReferenceIndex() == contig ) {
            cleanedReads.add(nextRead);
            cleanedReadHash.add(getUniquifiedReadName(nextRead));
            nextRead = (iterator.hasNext() ? iterator.next() : null);
        }
    }

    /**
     * Decides whether a read on readContig should be skipped while seeking targetContig.
     *
     * @param readContig   the reference index of the read being examined
     * @param targetContig the reference index being sought
     * @return true if the read should be skipped
     */
    private boolean sortsBefore(int readContig, int targetContig) {
        // a negative target (presumably unmapped reads, which sort last - TODO confirm)
        // keeps the original rule: skip everything that isn't the target itself
        if ( targetContig < 0 )
            return readContig != targetContig;
        // a negative read index is treated as sorting after every real contig
        return readContig >= 0 && readContig < targetContig;
    }
}
} }