diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java index cd87d17c4..1e9d86b2f 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java @@ -69,20 +69,13 @@ public class TraverseByLocusWindows extends TraversalEngine { if ( locations.isEmpty() ) { logger.debug("There are no intervals provided for the traversal"); } else { - if ( ! samReader.hasIndex() ) + if ( !samReader.hasIndex() ) Utils.scareUser("Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting."); - for ( GenomeLoc interval : locations ) { - logger.debug(String.format("Processing interval %s", interval.toString())); - - CloseableIterator readIter = samReader.queryOverlapping( interval.getContig(), - (int)interval.getStart(), - (int)interval.getStop()); - - Iterator wrappedIter = WrapReadsIterator( readIter, false ); - sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval); - readIter.close(); - } + if ( walker.actOnNonIntervalReads() ) + sum = fullInputTraversal(walker, locations, sum); + else + sum = strictIntervalTraversal(walker, locations, sum); } //printOnTraversalDone("intervals", sum); @@ -90,20 +83,81 @@ public class TraverseByLocusWindows extends TraversalEngine { return sum; } + protected T strictIntervalTraversal(LocusWindowWalker walker, ArrayList locations, T sum) { + for ( GenomeLoc interval : locations ) { + logger.debug(String.format("Processing interval %s", interval.toString())); + + CloseableIterator readIter = samReader.queryOverlapping( interval.getContig(), + (int)interval.getStart(), + (int)interval.getStop()); + + Iterator wrappedIter = WrapReadsIterator( readIter, false ); + sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval); + readIter.close(); + } + return sum; + } + + protected T fullInputTraversal(LocusWindowWalker walker, ArrayList locations, T sum) { + + // set everything up + GenomeLoc currentInterval = (locations.size() > 0 ? locations.get(0) : null); + int locationsIndex = 0; + ArrayList intervalReads = new ArrayList(); + Iterator readIter = samReader.iterator(); + + while (readIter.hasNext()) { + TraversalStatistics.nRecords++; + SAMRecord read = readIter.next(); + + // if there are no locations or we're past the last one or it's unmapped, then act on the read separately + if ( currentInterval == null || read.getReadUnmappedFlag() ) { + walker.nonIntervalReadAction(read); + } + else { + GenomeLoc loc = new GenomeLoc(read); + // if we're in the current interval, add it to the list + if ( currentInterval.overlapsP(loc) ) { + intervalReads.add(read); + } + // if we're not yet in the interval, act on the read separately + else if ( currentInterval.isPast(loc) ) { + walker.nonIntervalReadAction(read); + } + // otherwise, we're past the interval so first deal with the collected reads and then this one + else { + if ( intervalReads.size() > 0 ) { + Iterator wrappedIter = WrapReadsIterator(intervalReads.iterator(), false); + sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval); + + // then prepare for the next interval + intervalReads.clear(); + currentInterval = (++locationsIndex < locations.size() ? locations.get(locationsIndex) : null); + } + walker.nonIntervalReadAction(read); + } + } + } + // some cleanup + if ( intervalReads.size() > 0 ) { + Iterator wrappedIter = WrapReadsIterator(intervalReads.iterator(), false); + sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval); + } + + return sum; + } + protected T carryWalkerOverInterval(LocusWindowWalker walker, Iterator readIter, T sum, GenomeLoc interval ) { logger.debug(String.format("TraverseByIntervals.carryWalkerOverInterval Genomic interval is %s", interval)); - // prepare the read filtering read iterator and provide it to a new interval iterator - FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc()); - ArrayList reads = new ArrayList(); ArrayList offsets = new ArrayList(); boolean done = false; long leftmostIndex = interval.getStart(), rightmostIndex = interval.getStop(); - while (filterIter.hasNext() && !done) { + while (readIter.hasNext() && !done) { TraversalStatistics.nRecords++; - SAMRecord read = filterIter.next(); + SAMRecord read = readIter.next(); reads.add(read); offsets.add((int)(read.getAlignmentStart() - interval.getStart())); if ( read.getAlignmentStart() < leftmostIndex ) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWindowWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWindowWalker.java index 2c533b6df..ba0ec2b5d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWindowWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWindowWalker.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.walkers; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.LocusContext; +import net.sf.samtools.SAMRecord; + /** * Created by IntelliJ IDEA. * User: ebanks @@ -13,9 +15,17 @@ import org.broadinstitute.sting.gatk.LocusContext; public abstract class LocusWindowWalker extends Walker { // Do we actually want to operate on the context? public boolean filter(RefMetaDataTracker tracker, String ref, LocusContext context) { - return true; // We are keeping all the reads + return true; // We are keeping all the intervals } + // do we care about reads that are not part of our intervals? + public boolean actOnNonIntervalReads() { + return false; // Don't act on them + } + + // What do we do with the reads that are not part of our intervals? + public void nonIntervalReadAction(SAMRecord read) { } + // Map over the org.broadinstitute.sting.gatk.LocusContext public abstract MapType map(RefMetaDataTracker tracker, String ref, LocusContext context); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java index c0b584c6e..9e5e696ae 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java @@ -21,6 +21,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker public int maxReadLength = -1; @Argument(fullName="OutputCleaned", shortName="O", required=true, doc="Output file (sam or bam) for improved (realigned) reads") public String OUT; + @Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false) + public boolean printAllReads = false; + public static final int MAX_QUAL = 99; private SAMFileWriter writer; @@ -30,12 +33,28 @@ public class IntervalCleanerWalker extends LocusWindowWalker writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT)); } + // do we care about reads that are not part of our intervals? + public boolean actOnNonIntervalReads() { + return printAllReads; + } + + // What do we do with the reads that are not part of our intervals? + public void nonIntervalReadAction(SAMRecord read) { + writer.addAlignment(read); + } + public Integer map(RefMetaDataTracker tracker, String ref, LocusContext context) { List reads = context.getReads(); ArrayList goodReads = new ArrayList(); for ( SAMRecord read : reads ) { - if ( read.getReadLength() <= maxReadLength ) + if ( read.getReadLength() <= maxReadLength && + !read.getReadUnmappedFlag() && + !read.getNotPrimaryAlignmentFlag() && + !read.getDuplicateReadFlag() && + read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) goodReads.add(read); + else + writer.addAlignment(read); } clean(goodReads, ref, context.getLocation().getStart()); @@ -59,34 +78,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker writer.close(); } - private static int mismatchQualitySumCigar(AlignedRead aRead, String ref, int refIndex) { - String read = aRead.getReadString(); - String quals = aRead.getBaseQualityString(); - Cigar c = aRead.getCigar(); - - int sum = 0; - int readIndex = 0; - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - switch( ce.getOperator() ) { - case M: - for ( int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) { - if ( Character.toUpperCase(read.charAt(readIndex)) != Character.toUpperCase(ref.charAt(refIndex)) ) - sum += (int)quals.charAt(readIndex) - 33; - } - break; - case I: - readIndex += ce.getLength(); - break; - case D: - refIndex += ce.getLength(); - break; - default: throw new RuntimeException("Unrecognized cigar element"); - } - } - return sum; - } - private static int mismatchQualitySum(AlignedRead aRead, String ref, int refIndex) { String read = aRead.getReadString(); String quals = aRead.getBaseQualityString(); @@ -136,6 +127,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker AlignedRead aRead = altReads.get(index); SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString()); int refIdx = swConsensus.getAlignmentStart2wrt1(); + if ( refIdx < 0 || refIdx > reference.length() - aRead.getReadString().length() ) + continue; // create the new consensus StringBuffer sb = new StringBuffer(); @@ -199,7 +192,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker } // if the best alternate consensus has a smaller sum of quality score mismatches, then clean! - if ( bestConsensus.mismatchSum < totalMismatchSum ) { + if ( bestConsensus != null && bestConsensus.mismatchSum < totalMismatchSum ) { logger.info("CLEAN: " + bestConsensus.str); // clean the appropriate reads