Added functionality to allow for a contract between LocusWindowTraversalEngine and LocusWindowWalker which allows the Walker to act upon reads outside of the provided intervals.

(Really, all we want to do is spit out all reads, but this allows the Walker to do other things with the reads if it wants)


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@641 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-05-08 17:28:16 +00:00
parent de1c282e62
commit 3aabc144c6
3 changed files with 105 additions and 48 deletions

View File

@ -69,20 +69,13 @@ public class TraverseByLocusWindows extends TraversalEngine {
if ( locations.isEmpty() ) {
logger.debug("There are no intervals provided for the traversal");
} else {
if ( ! samReader.hasIndex() )
if ( !samReader.hasIndex() )
Utils.scareUser("Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting.");
for ( GenomeLoc interval : locations ) {
logger.debug(String.format("Processing interval %s", interval.toString()));
CloseableIterator<SAMRecord> readIter = samReader.queryOverlapping( interval.getContig(),
(int)interval.getStart(),
(int)interval.getStop());
Iterator<SAMRecord> wrappedIter = WrapReadsIterator( readIter, false );
sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval);
readIter.close();
}
if ( walker.actOnNonIntervalReads() )
sum = fullInputTraversal(walker, locations, sum);
else
sum = strictIntervalTraversal(walker, locations, sum);
}
//printOnTraversalDone("intervals", sum);
@ -90,20 +83,81 @@ public class TraverseByLocusWindows extends TraversalEngine {
return sum;
}
protected <M, T> T strictIntervalTraversal(LocusWindowWalker<M, T> walker, ArrayList<GenomeLoc> locations, T sum) {
for ( GenomeLoc interval : locations ) {
logger.debug(String.format("Processing interval %s", interval.toString()));
CloseableIterator<SAMRecord> readIter = samReader.queryOverlapping( interval.getContig(),
(int)interval.getStart(),
(int)interval.getStop());
Iterator<SAMRecord> wrappedIter = WrapReadsIterator( readIter, false );
sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval);
readIter.close();
}
return sum;
}
protected <M, T> T fullInputTraversal(LocusWindowWalker<M, T> walker, ArrayList<GenomeLoc> locations, T sum) {
// set everything up
GenomeLoc currentInterval = (locations.size() > 0 ? locations.get(0) : null);
int locationsIndex = 0;
ArrayList<SAMRecord> intervalReads = new ArrayList<SAMRecord>();
Iterator<SAMRecord> readIter = samReader.iterator();
while (readIter.hasNext()) {
TraversalStatistics.nRecords++;
SAMRecord read = readIter.next();
// if there are no locations or we're past the last one or it's unmapped, then act on the read separately
if ( currentInterval == null || read.getReadUnmappedFlag() ) {
walker.nonIntervalReadAction(read);
}
else {
GenomeLoc loc = new GenomeLoc(read);
// if we're in the current interval, add it to the list
if ( currentInterval.overlapsP(loc) ) {
intervalReads.add(read);
}
// if we're not yet in the interval, act on the read separately
else if ( currentInterval.isPast(loc) ) {
walker.nonIntervalReadAction(read);
}
// otherwise, we're past the interval so first deal with the collected reads and then this one
else {
if ( intervalReads.size() > 0 ) {
Iterator<SAMRecord> wrappedIter = WrapReadsIterator(intervalReads.iterator(), false);
sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval);
// then prepare for the next interval
intervalReads.clear();
currentInterval = (++locationsIndex < locations.size() ? locations.get(locationsIndex) : null);
}
walker.nonIntervalReadAction(read);
}
}
}
// some cleanup
if ( intervalReads.size() > 0 ) {
Iterator<SAMRecord> wrappedIter = WrapReadsIterator(intervalReads.iterator(), false);
sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval);
}
return sum;
}
protected <M, T> T carryWalkerOverInterval(LocusWindowWalker<M, T> walker, Iterator<SAMRecord> readIter, T sum, GenomeLoc interval ) {
logger.debug(String.format("TraverseByIntervals.carryWalkerOverInterval Genomic interval is %s", interval));
// prepare the read filtering read iterator and provide it to a new interval iterator
FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc());
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
ArrayList<Integer> offsets = new ArrayList<Integer>();
boolean done = false;
long leftmostIndex = interval.getStart(),
rightmostIndex = interval.getStop();
while (filterIter.hasNext() && !done) {
while (readIter.hasNext() && !done) {
TraversalStatistics.nRecords++;
SAMRecord read = filterIter.next();
SAMRecord read = readIter.next();
reads.add(read);
offsets.add((int)(read.getAlignmentStart() - interval.getStart()));
if ( read.getAlignmentStart() < leftmostIndex )

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: ebanks
@ -13,9 +15,17 @@ import org.broadinstitute.sting.gatk.LocusContext;
public abstract class LocusWindowWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(RefMetaDataTracker tracker, String ref, LocusContext context) {
return true; // We are keeping all the reads
return true; // We are keeping all the intervals
}
// do we care about reads that are not part of our intervals?
public boolean actOnNonIntervalReads() {
return false; // Don't act on them
}
// What do we do with the reads that are not part of our intervals?
public void nonIntervalReadAction(SAMRecord read) { }
// Map over the org.broadinstitute.sting.gatk.LocusContext
public abstract MapType map(RefMetaDataTracker tracker, String ref, LocusContext context);

View File

@ -21,6 +21,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
public int maxReadLength = -1;
@Argument(fullName="OutputCleaned", shortName="O", required=true, doc="Output file (sam or bam) for improved (realigned) reads")
public String OUT;
@Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false)
public boolean printAllReads = false;
public static final int MAX_QUAL = 99;
private SAMFileWriter writer;
@ -30,12 +33,28 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT));
}
// do we care about reads that are not part of our intervals?
public boolean actOnNonIntervalReads() {
return printAllReads;
}
// What do we do with the reads that are not part of our intervals?
public void nonIntervalReadAction(SAMRecord read) {
writer.addAlignment(read);
}
public Integer map(RefMetaDataTracker tracker, String ref, LocusContext context) {
List<SAMRecord> reads = context.getReads();
ArrayList<SAMRecord> goodReads = new ArrayList<SAMRecord>();
for ( SAMRecord read : reads ) {
if ( read.getReadLength() <= maxReadLength )
if ( read.getReadLength() <= maxReadLength &&
!read.getReadUnmappedFlag() &&
!read.getNotPrimaryAlignmentFlag() &&
!read.getDuplicateReadFlag() &&
read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START )
goodReads.add(read);
else
writer.addAlignment(read);
}
clean(goodReads, ref, context.getLocation().getStart());
@ -59,34 +78,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
writer.close();
}
private static int mismatchQualitySumCigar(AlignedRead aRead, String ref, int refIndex) {
String read = aRead.getReadString();
String quals = aRead.getBaseQualityString();
Cigar c = aRead.getCigar();
int sum = 0;
int readIndex = 0;
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
CigarElement ce = c.getCigarElement(i);
switch( ce.getOperator() ) {
case M:
for ( int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
if ( Character.toUpperCase(read.charAt(readIndex)) != Character.toUpperCase(ref.charAt(refIndex)) )
sum += (int)quals.charAt(readIndex) - 33;
}
break;
case I:
readIndex += ce.getLength();
break;
case D:
refIndex += ce.getLength();
break;
default: throw new RuntimeException("Unrecognized cigar element");
}
}
return sum;
}
private static int mismatchQualitySum(AlignedRead aRead, String ref, int refIndex) {
String read = aRead.getReadString();
String quals = aRead.getBaseQualityString();
@ -136,6 +127,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
AlignedRead aRead = altReads.get(index);
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
int refIdx = swConsensus.getAlignmentStart2wrt1();
if ( refIdx < 0 || refIdx > reference.length() - aRead.getReadString().length() )
continue;
// create the new consensus
StringBuffer sb = new StringBuffer();
@ -199,7 +192,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
}
// if the best alternate consensus has a smaller sum of quality score mismatches, then clean!
if ( bestConsensus.mismatchSum < totalMismatchSum ) {
if ( bestConsensus != null && bestConsensus.mismatchSum < totalMismatchSum ) {
logger.info("CLEAN: " + bestConsensus.str);
// clean the appropriate reads