- Fix the CleanedReadInjector to deal with -L intervals correctly.

- Some walkers don't use the ref base, so speed up traversals by not requiring it


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1652 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-09-18 16:17:58 +00:00
parent 7da9ff2a9e
commit 2b2df4e1ba
4 changed files with 61 additions and 11 deletions

View File

@ -4,7 +4,8 @@ import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
@ -26,6 +27,7 @@ import java.util.*;
* Copies reads from the input stream into the <code>outputBAM</code>, replacing those
* reads which have been cleaned with their new clean copies.
*/
@Requires({DataSource.READS})
public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
/**
@ -45,6 +47,11 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
*/
private Queue<SAMRecord> cleanedReads = new LinkedList<SAMRecord>();
/**
* The intervals specified by the user
*/
private HashMap<String, ArrayList<GenomeLoc>> intervals = null;
/**
* A fast lookup table for uniquified read info
*/
@ -63,6 +70,21 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
cleanedReadHash.add(getUniquifiedReadName(read));
}
allReads.close();
// If there are intervals specified by the user, record them so we can make sure not
// to emit reads outside the intervals. For now, we'll group them by chromosome to
// make lookup a bit faster.
if ( this.getToolkit().getArguments().intervals != null ) {
intervals = new HashMap<String, ArrayList<GenomeLoc>>();
List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(this.getToolkit().getArguments().intervals);
Iterator<GenomeLoc> iter = GenomeLocSortedSet.createSetFromList(locs).iterator();
while ( iter.hasNext() ) {
GenomeLoc loc = iter.next();
if ( intervals.get(loc.getContig()) == null )
intervals.put(loc.getContig(), new ArrayList<GenomeLoc>());
intervals.get(loc.getContig()).add(loc);
}
}
}
/**
@ -81,9 +103,9 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
while ( firstCleanedRead != null &&
firstCleanedRead.getReferenceIndex() <= read.getReferenceIndex() &&
firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) {
outputBAM.addAlignment(firstCleanedRead);
cleanedReadCount++;
cleanedReads.remove();
if ( emit(firstCleanedRead) )
cleanedReadCount++;
cleanedReads.remove();
firstCleanedRead = cleanedReads.peek();
}
@ -92,6 +114,37 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
return cleanedReadCount;
}
/**
 * Writes the given read to <code>outputBAM</code> when it is eligible, and reports
 * whether it was written.
 *
 * If no intervals were recorded, every read is written. Otherwise the read is written
 * only when it overlaps one of the intervals stored under its reference name.
 *
 * @param read the read being considered for output
 * @return true if the read was added to the output, false if it was skipped
 */
private boolean emit(SAMRecord read) {
    // with no user-specified intervals, everything is accepted
    boolean accepted = ( intervals == null );

    if ( !accepted ) {
        ArrayList<GenomeLoc> contigIntervals = intervals.get(read.getReferenceName());
        // no intervals recorded under this read's reference name means nothing to match against
        if ( contigIntervals != null ) {
            GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
            Iterator<GenomeLoc> cursor = contigIntervals.iterator();
            while ( !accepted && cursor.hasNext() ) {
                GenomeLoc candidate = cursor.next();
                if ( candidate.overlapsP(readLoc) ) {
                    accepted = true;
                } else if ( candidate.isPast(readLoc) ) {
                    // this interval is already beyond the read, so stop scanning
                    // (presumably the list is in sorted order -- confirm against where it is built)
                    break;
                }
            }
        }
    }

    // single write site for every accepted read
    if ( accepted )
        outputBAM.addAlignment(read);
    return accepted;
}
/**
* Initialize traversal with number of reads which have been replaced with a clean version.
* @return 0 to initialize the traversal.

View File

@ -1,8 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.utils.cmdLine.Argument;
@ -18,6 +16,7 @@ import java.util.List;
// although this can easily be changed if necessary.
@WalkerName("IndelIntervals")
@Requires({DataSource.READS})
@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class})
public class IndelIntervalWalker extends ReadWalker<IndelIntervalWalker.Interval, IndelIntervalWalker.Interval> {
@Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false)

View File

@ -1,4 +1,3 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.*;

View File

@ -1,4 +1,3 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import org.broadinstitute.sting.gatk.refdata.*;
@ -9,7 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
@WalkerName("SNPClusters")
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="snps",type=AllelicVariant.class)})
@Requires(value={},referenceMetaData={@RMD(name="snps",type=AllelicVariant.class)})
public class SNPClusterWalker extends RefWalker<GenomeLoc, GenomeLoc> {
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating clusters", required=false)
int windowSize = 10;
@ -60,4 +59,4 @@ public class SNPClusterWalker extends RefWalker<GenomeLoc, GenomeLoc> {
out.println(sum);
return value;
}
}
}