Added functionality to allow for a contract between LocusWindowTraversalEngine and LocusWindowWalker which allows the Walker to act upon reads outside of the provided intervals.
(Really, all we want to do is spit out all reads, but this allows the Walker to do other things with the reads if it wants) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@641 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
de1c282e62
commit
3aabc144c6
|
|
@ -69,20 +69,13 @@ public class TraverseByLocusWindows extends TraversalEngine {
|
|||
if ( locations.isEmpty() ) {
|
||||
logger.debug("There are no intervals provided for the traversal");
|
||||
} else {
|
||||
if ( ! samReader.hasIndex() )
|
||||
if ( !samReader.hasIndex() )
|
||||
Utils.scareUser("Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting.");
|
||||
|
||||
for ( GenomeLoc interval : locations ) {
|
||||
logger.debug(String.format("Processing interval %s", interval.toString()));
|
||||
|
||||
CloseableIterator<SAMRecord> readIter = samReader.queryOverlapping( interval.getContig(),
|
||||
(int)interval.getStart(),
|
||||
(int)interval.getStop());
|
||||
|
||||
Iterator<SAMRecord> wrappedIter = WrapReadsIterator( readIter, false );
|
||||
sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval);
|
||||
readIter.close();
|
||||
}
|
||||
if ( walker.actOnNonIntervalReads() )
|
||||
sum = fullInputTraversal(walker, locations, sum);
|
||||
else
|
||||
sum = strictIntervalTraversal(walker, locations, sum);
|
||||
}
|
||||
|
||||
//printOnTraversalDone("intervals", sum);
|
||||
|
|
@ -90,20 +83,81 @@ public class TraverseByLocusWindows extends TraversalEngine {
|
|||
return sum;
|
||||
}
|
||||
|
||||
protected <M, T> T strictIntervalTraversal(LocusWindowWalker<M, T> walker, ArrayList<GenomeLoc> locations, T sum) {
|
||||
for ( GenomeLoc interval : locations ) {
|
||||
logger.debug(String.format("Processing interval %s", interval.toString()));
|
||||
|
||||
CloseableIterator<SAMRecord> readIter = samReader.queryOverlapping( interval.getContig(),
|
||||
(int)interval.getStart(),
|
||||
(int)interval.getStop());
|
||||
|
||||
Iterator<SAMRecord> wrappedIter = WrapReadsIterator( readIter, false );
|
||||
sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval);
|
||||
readIter.close();
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
protected <M, T> T fullInputTraversal(LocusWindowWalker<M, T> walker, ArrayList<GenomeLoc> locations, T sum) {
|
||||
|
||||
// set everything up
|
||||
GenomeLoc currentInterval = (locations.size() > 0 ? locations.get(0) : null);
|
||||
int locationsIndex = 0;
|
||||
ArrayList<SAMRecord> intervalReads = new ArrayList<SAMRecord>();
|
||||
Iterator<SAMRecord> readIter = samReader.iterator();
|
||||
|
||||
while (readIter.hasNext()) {
|
||||
TraversalStatistics.nRecords++;
|
||||
SAMRecord read = readIter.next();
|
||||
|
||||
// if there are no locations or we're past the last one or it's unmapped, then act on the read separately
|
||||
if ( currentInterval == null || read.getReadUnmappedFlag() ) {
|
||||
walker.nonIntervalReadAction(read);
|
||||
}
|
||||
else {
|
||||
GenomeLoc loc = new GenomeLoc(read);
|
||||
// if we're in the current interval, add it to the list
|
||||
if ( currentInterval.overlapsP(loc) ) {
|
||||
intervalReads.add(read);
|
||||
}
|
||||
// if we're not yet in the interval, act on the read separately
|
||||
else if ( currentInterval.isPast(loc) ) {
|
||||
walker.nonIntervalReadAction(read);
|
||||
}
|
||||
// otherwise, we're past the interval so first deal with the collected reads and then this one
|
||||
else {
|
||||
if ( intervalReads.size() > 0 ) {
|
||||
Iterator<SAMRecord> wrappedIter = WrapReadsIterator(intervalReads.iterator(), false);
|
||||
sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval);
|
||||
|
||||
// then prepare for the next interval
|
||||
intervalReads.clear();
|
||||
currentInterval = (++locationsIndex < locations.size() ? locations.get(locationsIndex) : null);
|
||||
}
|
||||
walker.nonIntervalReadAction(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
// some cleanup
|
||||
if ( intervalReads.size() > 0 ) {
|
||||
Iterator<SAMRecord> wrappedIter = WrapReadsIterator(intervalReads.iterator(), false);
|
||||
sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
protected <M, T> T carryWalkerOverInterval(LocusWindowWalker<M, T> walker, Iterator<SAMRecord> readIter, T sum, GenomeLoc interval ) {
|
||||
logger.debug(String.format("TraverseByIntervals.carryWalkerOverInterval Genomic interval is %s", interval));
|
||||
|
||||
// prepare the read filtering read iterator and provide it to a new interval iterator
|
||||
FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc());
|
||||
|
||||
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
||||
boolean done = false;
|
||||
long leftmostIndex = interval.getStart(),
|
||||
rightmostIndex = interval.getStop();
|
||||
while (filterIter.hasNext() && !done) {
|
||||
while (readIter.hasNext() && !done) {
|
||||
TraversalStatistics.nRecords++;
|
||||
SAMRecord read = filterIter.next();
|
||||
SAMRecord read = readIter.next();
|
||||
reads.add(read);
|
||||
offsets.add((int)(read.getAlignmentStart() - interval.getStart()));
|
||||
if ( read.getAlignmentStart() < leftmostIndex )
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.walkers;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
|
|
@ -13,9 +15,17 @@ import org.broadinstitute.sting.gatk.LocusContext;
|
|||
public abstract class LocusWindowWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(RefMetaDataTracker tracker, String ref, LocusContext context) {
|
||||
return true; // We are keeping all the reads
|
||||
return true; // We are keeping all the intervals
|
||||
}
|
||||
|
||||
// do we care about reads that are not part of our intervals?
|
||||
public boolean actOnNonIntervalReads() {
|
||||
return false; // Don't act on them
|
||||
}
|
||||
|
||||
// What do we do with the reads that are not part of our intervals?
|
||||
public void nonIntervalReadAction(SAMRecord read) { }
|
||||
|
||||
// Map over the org.broadinstitute.sting.gatk.LocusContext
|
||||
public abstract MapType map(RefMetaDataTracker tracker, String ref, LocusContext context);
|
||||
|
||||
|
|
|
|||
|
|
@ -21,6 +21,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
public int maxReadLength = -1;
|
||||
@Argument(fullName="OutputCleaned", shortName="O", required=true, doc="Output file (sam or bam) for improved (realigned) reads")
|
||||
public String OUT;
|
||||
@Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false)
|
||||
public boolean printAllReads = false;
|
||||
|
||||
public static final int MAX_QUAL = 99;
|
||||
|
||||
private SAMFileWriter writer;
|
||||
|
|
@ -30,12 +33,28 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT));
|
||||
}
|
||||
|
||||
// do we care about reads that are not part of our intervals?
|
||||
public boolean actOnNonIntervalReads() {
|
||||
return printAllReads;
|
||||
}
|
||||
|
||||
// What do we do with the reads that are not part of our intervals?
|
||||
public void nonIntervalReadAction(SAMRecord read) {
|
||||
writer.addAlignment(read);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, String ref, LocusContext context) {
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
ArrayList<SAMRecord> goodReads = new ArrayList<SAMRecord>();
|
||||
for ( SAMRecord read : reads ) {
|
||||
if ( read.getReadLength() <= maxReadLength )
|
||||
if ( read.getReadLength() <= maxReadLength &&
|
||||
!read.getReadUnmappedFlag() &&
|
||||
!read.getNotPrimaryAlignmentFlag() &&
|
||||
!read.getDuplicateReadFlag() &&
|
||||
read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START )
|
||||
goodReads.add(read);
|
||||
else
|
||||
writer.addAlignment(read);
|
||||
}
|
||||
|
||||
clean(goodReads, ref, context.getLocation().getStart());
|
||||
|
|
@ -59,34 +78,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
writer.close();
|
||||
}
|
||||
|
||||
private static int mismatchQualitySumCigar(AlignedRead aRead, String ref, int refIndex) {
|
||||
String read = aRead.getReadString();
|
||||
String quals = aRead.getBaseQualityString();
|
||||
Cigar c = aRead.getCigar();
|
||||
|
||||
int sum = 0;
|
||||
int readIndex = 0;
|
||||
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
switch( ce.getOperator() ) {
|
||||
case M:
|
||||
for ( int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
|
||||
if ( Character.toUpperCase(read.charAt(readIndex)) != Character.toUpperCase(ref.charAt(refIndex)) )
|
||||
sum += (int)quals.charAt(readIndex) - 33;
|
||||
}
|
||||
break;
|
||||
case I:
|
||||
readIndex += ce.getLength();
|
||||
break;
|
||||
case D:
|
||||
refIndex += ce.getLength();
|
||||
break;
|
||||
default: throw new RuntimeException("Unrecognized cigar element");
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
private static int mismatchQualitySum(AlignedRead aRead, String ref, int refIndex) {
|
||||
String read = aRead.getReadString();
|
||||
String quals = aRead.getBaseQualityString();
|
||||
|
|
@ -136,6 +127,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
AlignedRead aRead = altReads.get(index);
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
|
||||
int refIdx = swConsensus.getAlignmentStart2wrt1();
|
||||
if ( refIdx < 0 || refIdx > reference.length() - aRead.getReadString().length() )
|
||||
continue;
|
||||
|
||||
// create the new consensus
|
||||
StringBuffer sb = new StringBuffer();
|
||||
|
|
@ -199,7 +192,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
}
|
||||
|
||||
// if the best alternate consensus has a smaller sum of quality score mismatches, then clean!
|
||||
if ( bestConsensus.mismatchSum < totalMismatchSum ) {
|
||||
if ( bestConsensus != null && bestConsensus.mismatchSum < totalMismatchSum ) {
|
||||
logger.info("CLEAN: " + bestConsensus.str);
|
||||
|
||||
// clean the appropriate reads
|
||||
|
|
|
|||
Loading…
Reference in New Issue