Hash by chromosome for now to reduce memory. This is a temporary solution until we decide how to reture the Injector for good.
Also, with Picard's latest changes, we need to make sure we don't double-close the sam writer. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1779 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f9a1598d75
commit
1905b5defa
|
|
@ -42,6 +42,11 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
@Argument(fullName="output_bam",shortName="ob",doc="Output BAM file",required=true)
|
@Argument(fullName="output_bam",shortName="ob",doc="Output BAM file",required=true)
|
||||||
SAMFileWriter outputBAM = null;
|
SAMFileWriter outputBAM = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The iterator for the cleaned reads
|
||||||
|
*/
|
||||||
|
private ByContigIterator cleanedReadsIterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The set of (sorted) cleaned reads
|
* The set of (sorted) cleaned reads
|
||||||
*/
|
*/
|
||||||
|
|
@ -60,31 +65,26 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
@Override
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
||||||
// For now, read the whole damn file into memory. If this becomes a problem,
|
// For now, read the whole file into memory a chromosome at a time (because we can
|
||||||
// then we just need to read the hash into memory and the first read; we'd then
|
// never clean from one chromosome to another). If We ever have time to do this correctly
|
||||||
// need to query the BAM file every time we needed to update the cleaned read iterator.
|
// (or it becomes a memory problem), then we'll need to read the hash into memory and the
|
||||||
CloseableIterator<SAMRecord> allReads = cleanedReadsSource.iterator();
|
// first read; we'd then query the BAM file every time we needed to update the cleaned read iterator.
|
||||||
while ( allReads.hasNext() ) {
|
cleanedReadsIterator = new ByContigIterator(cleanedReadsSource.iterator());
|
||||||
SAMRecord read = allReads.next();
|
|
||||||
cleanedReads.add(read);
|
|
||||||
cleanedReadHash.add(getUniquifiedReadName(read));
|
|
||||||
}
|
|
||||||
allReads.close();
|
|
||||||
|
|
||||||
// If there are intervals specified by the user,record them so we can make sure not
|
// If there are intervals specified by the user, record them so we can make sure not
|
||||||
// to emit reads outside the intervals. For now, we'll group them by chromosome to
|
// to emit reads outside the intervals. For now, we'll group them by chromosome to
|
||||||
// make lookup a bit faster.
|
// make lookup a bit faster.
|
||||||
if ( this.getToolkit() != null &&
|
if ( this.getToolkit() != null &&
|
||||||
this.getToolkit().getArguments().intervals != null ) {
|
this.getToolkit().getArguments().intervals != null ) {
|
||||||
intervals = new HashMap<String, ArrayList<GenomeLoc>>();
|
intervals = new HashMap<String, ArrayList<GenomeLoc>>();
|
||||||
List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(this.getToolkit().getArguments().intervals);
|
List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(this.getToolkit().getArguments().intervals);
|
||||||
Iterator<GenomeLoc> iter = GenomeLocSortedSet.createSetFromList(locs).iterator();
|
Iterator<GenomeLoc> iter = GenomeLocSortedSet.createSetFromList(locs).iterator();
|
||||||
while ( iter.hasNext() ) {
|
while ( iter.hasNext() ) {
|
||||||
GenomeLoc loc = iter.next();
|
GenomeLoc loc = iter.next();
|
||||||
if ( intervals.get(loc.getContig()) == null )
|
if ( intervals.get(loc.getContig()) == null )
|
||||||
intervals.put(loc.getContig(), new ArrayList<GenomeLoc>());
|
intervals.put(loc.getContig(), new ArrayList<GenomeLoc>());
|
||||||
intervals.get(loc.getContig()).add(loc);
|
intervals.get(loc.getContig()).add(loc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -104,12 +104,15 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
while ( firstCleanedRead != null &&
|
while ( firstCleanedRead != null &&
|
||||||
firstCleanedRead.getReferenceIndex() <= read.getReferenceIndex() &&
|
firstCleanedRead.getReferenceIndex() <= read.getReferenceIndex() &&
|
||||||
firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) {
|
firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) {
|
||||||
if ( emit(firstCleanedRead) )
|
if ( emit(firstCleanedRead) )
|
||||||
cleanedReadCount++;
|
cleanedReadCount++;
|
||||||
cleanedReads.remove();
|
cleanedReads.remove();
|
||||||
firstCleanedRead = cleanedReads.peek();
|
firstCleanedRead = cleanedReads.peek();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// update the hashes if necessary
|
||||||
|
cleanedReadsIterator.readNextContig(read.getReferenceIndex());
|
||||||
|
|
||||||
if ( !cleanedReadHash.contains(getUniquifiedReadName(read)) )
|
if ( !cleanedReadHash.contains(getUniquifiedReadName(read)) )
|
||||||
outputBAM.addAlignment(read);
|
outputBAM.addAlignment(read);
|
||||||
return cleanedReadCount;
|
return cleanedReadCount;
|
||||||
|
|
@ -119,31 +122,31 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
* Determine whether to emit the given read; if so, return true.
|
* Determine whether to emit the given read; if so, return true.
|
||||||
*/
|
*/
|
||||||
private boolean emit(SAMRecord read) {
|
private boolean emit(SAMRecord read) {
|
||||||
// if no intervals were specified, emit everything
|
// if no intervals were specified, emit everything
|
||||||
if ( intervals == null ) {
|
if ( intervals == null ) {
|
||||||
outputBAM.addAlignment(read);
|
outputBAM.addAlignment(read);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
ArrayList<GenomeLoc> intervalList = intervals.get(read.getReferenceName());
|
ArrayList<GenomeLoc> intervalList = intervals.get(read.getReferenceName());
|
||||||
if ( intervalList == null )
|
if ( intervalList == null )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
|
GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
|
||||||
for ( GenomeLoc interval : intervalList ) {
|
for ( GenomeLoc interval : intervalList ) {
|
||||||
// if it overlaps an interval, then we can emit it
|
// if it overlaps an interval, then we can emit it
|
||||||
if ( interval.overlapsP(readLoc) ) {
|
if ( interval.overlapsP(readLoc) ) {
|
||||||
outputBAM.addAlignment(read);
|
outputBAM.addAlignment(read);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// once we've passed any interval that could overlap it, just quit
|
// once we've passed any interval that could overlap it, just quit
|
||||||
if ( interval.isPast(readLoc) )
|
if ( interval.isPast(readLoc) )
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// it didn't overlap an interval
|
// it didn't overlap an interval
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -166,7 +169,7 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onTraversalDone( Integer value ) {
|
public void onTraversalDone( Integer value ) {
|
||||||
outputBAM.close();
|
cleanedReadsIterator.iterator.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -177,4 +180,42 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
|
||||||
private static String getUniquifiedReadName( SAMRecord read ) {
|
private static String getUniquifiedReadName( SAMRecord read ) {
|
||||||
return String.format("%s.%s.%s.%s",read.getAttribute("RG"),read.getReadName(),read.getFlags(),read.getReadString());
|
return String.format("%s.%s.%s.%s",read.getAttribute("RG"),read.getReadName(),read.getFlags(),read.getReadString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private class ByContigIterator {
|
||||||
|
|
||||||
|
SAMRecord nextRead;
|
||||||
|
CloseableIterator<SAMRecord> iterator;
|
||||||
|
int contig = -1;
|
||||||
|
|
||||||
|
public ByContigIterator(CloseableIterator<SAMRecord> iterator) {
|
||||||
|
this.iterator = iterator;
|
||||||
|
nextRead = (iterator.hasNext() ? iterator.next() : null);
|
||||||
|
if ( nextRead != null )
|
||||||
|
readNextContig(nextRead.getReferenceIndex());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void readNextContig(int newContig) {
|
||||||
|
// don't do anything if we're in the right contig or have no reads
|
||||||
|
if ( newContig == contig || nextRead == null )
|
||||||
|
return;
|
||||||
|
|
||||||
|
contig = newContig;
|
||||||
|
cleanedReadHash.clear();
|
||||||
|
cleanedReads.clear();
|
||||||
|
|
||||||
|
// first, get to the right contig
|
||||||
|
while ( nextRead != null &&
|
||||||
|
nextRead.getReferenceIndex() != contig ) {
|
||||||
|
nextRead = (iterator.hasNext() ? iterator.next() : null);
|
||||||
|
}
|
||||||
|
|
||||||
|
// now, read in all of the reads for this contig
|
||||||
|
while ( nextRead != null &&
|
||||||
|
nextRead.getReferenceIndex() == contig ) {
|
||||||
|
cleanedReads.add(nextRead);
|
||||||
|
cleanedReadHash.add(getUniquifiedReadName(nextRead));
|
||||||
|
nextRead = (iterator.hasNext() ? iterator.next() : null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue