From 3aabc144c651a2f3a64485ed6197fbd31cc8d825 Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 8 May 2009 17:28:16 +0000 Subject: [PATCH] Added functionality to allow for a contract between LocusWindowTraversalEngine and LocusWindowWalker which allows the Walker to act upon reads outside of the provided intervals. (Really, all we want to do is spit out all reads, but this allows the Walker to do other things with the reads if it wants) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@641 348d0f76-0448-11de-a6fe-93d51630548a --- .../traversals/TraverseByLocusWindows.java | 88 +++++++++++++++---- .../sting/gatk/walkers/LocusWindowWalker.java | 12 ++- .../gatk/walkers/IntervalCleanerWalker.java | 53 +++++------ 3 files changed, 105 insertions(+), 48 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java index cd87d17c4..1e9d86b2f 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java @@ -69,20 +69,13 @@ public class TraverseByLocusWindows extends TraversalEngine { if ( locations.isEmpty() ) { logger.debug("There are no intervals provided for the traversal"); } else { - if ( ! samReader.hasIndex() ) + if ( !samReader.hasIndex() ) Utils.scareUser("Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting."); - for ( GenomeLoc interval : locations ) { - logger.debug(String.format("Processing interval %s", interval.toString())); - - CloseableIterator readIter = samReader.queryOverlapping( interval.getContig(), - (int)interval.getStart(), - (int)interval.getStop()); - - Iterator wrappedIter = WrapReadsIterator( readIter, false ); - sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval); - readIter.close(); - } + if ( walker.actOnNonIntervalReads() ) + sum = fullInputTraversal(walker, locations, sum); + else + sum = strictIntervalTraversal(walker, locations, sum); } //printOnTraversalDone("intervals", sum); @@ -90,20 +83,81 @@ public class TraverseByLocusWindows extends TraversalEngine { return sum; } + protected T strictIntervalTraversal(LocusWindowWalker walker, ArrayList locations, T sum) { + for ( GenomeLoc interval : locations ) { + logger.debug(String.format("Processing interval %s", interval.toString())); + + CloseableIterator readIter = samReader.queryOverlapping( interval.getContig(), + (int)interval.getStart(), + (int)interval.getStop()); + + Iterator wrappedIter = WrapReadsIterator( readIter, false ); + sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval); + readIter.close(); + } + return sum; + } + + protected T fullInputTraversal(LocusWindowWalker walker, ArrayList locations, T sum) { + + // set everything up + GenomeLoc currentInterval = (locations.size() > 0 ? locations.get(0) : null); + int locationsIndex = 0; + ArrayList intervalReads = new ArrayList(); + Iterator readIter = samReader.iterator(); + + while (readIter.hasNext()) { + TraversalStatistics.nRecords++; + SAMRecord read = readIter.next(); + + // if there are no locations or we're past the last one or it's unmapped, then act on the read separately + if ( currentInterval == null || read.getReadUnmappedFlag() ) { + walker.nonIntervalReadAction(read); + } + else { + GenomeLoc loc = new GenomeLoc(read); + // if we're in the current interval, add it to the list + if ( currentInterval.overlapsP(loc) ) { + intervalReads.add(read); + } + // if we're not yet in the interval, act on the read separately + else if ( currentInterval.isPast(loc) ) { + walker.nonIntervalReadAction(read); + } + // otherwise, we're past the interval so first deal with the collected reads and then this one + else { + if ( intervalReads.size() > 0 ) { + Iterator wrappedIter = WrapReadsIterator(intervalReads.iterator(), false); + sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval); + + // then prepare for the next interval + intervalReads.clear(); + currentInterval = (++locationsIndex < locations.size() ? locations.get(locationsIndex) : null); + } + walker.nonIntervalReadAction(read); + } + } + } + // some cleanup + if ( intervalReads.size() > 0 ) { + Iterator wrappedIter = WrapReadsIterator(intervalReads.iterator(), false); + sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval); + } + + return sum; + } + protected T carryWalkerOverInterval(LocusWindowWalker walker, Iterator readIter, T sum, GenomeLoc interval ) { logger.debug(String.format("TraverseByIntervals.carryWalkerOverInterval Genomic interval is %s", interval)); - // prepare the read filtering read iterator and provide it to a new interval iterator - FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc()); - ArrayList reads = new ArrayList(); ArrayList offsets = new ArrayList(); boolean done = false; long leftmostIndex = interval.getStart(), rightmostIndex = interval.getStop(); - while (filterIter.hasNext() && !done) { + while (readIter.hasNext() && !done) { TraversalStatistics.nRecords++; - SAMRecord read = filterIter.next(); + SAMRecord read = readIter.next(); reads.add(read); offsets.add((int)(read.getAlignmentStart() - interval.getStart())); if ( read.getAlignmentStart() < leftmostIndex ) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWindowWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWindowWalker.java index 2c533b6df..ba0ec2b5d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWindowWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWindowWalker.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.walkers; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.LocusContext; +import net.sf.samtools.SAMRecord; + /** * Created by IntelliJ IDEA. * User: ebanks @@ -13,9 +15,17 @@ import org.broadinstitute.sting.gatk.LocusContext; public abstract class LocusWindowWalker extends Walker { // Do we actually want to operate on the context? public boolean filter(RefMetaDataTracker tracker, String ref, LocusContext context) { - return true; // We are keeping all the reads + return true; // We are keeping all the intervals } + // do we care about reads that are not part of our intervals? + public boolean actOnNonIntervalReads() { + return false; // Don't act on them + } + + // What do we do with the reads that are not part of our intervals? + public void nonIntervalReadAction(SAMRecord read) { } + // Map over the org.broadinstitute.sting.gatk.LocusContext public abstract MapType map(RefMetaDataTracker tracker, String ref, LocusContext context); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java index c0b584c6e..9e5e696ae 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java @@ -21,6 +21,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker public int maxReadLength = -1; @Argument(fullName="OutputCleaned", shortName="O", required=true, doc="Output file (sam or bam) for improved (realigned) reads") public String OUT; + @Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false) + public boolean printAllReads = false; + public static final int MAX_QUAL = 99; private SAMFileWriter writer; @@ -30,12 +33,28 @@ public class IntervalCleanerWalker extends LocusWindowWalker writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT)); } + // do we care about reads that are not part of our intervals? + public boolean actOnNonIntervalReads() { + return printAllReads; + } + + // What do we do with the reads that are not part of our intervals? + public void nonIntervalReadAction(SAMRecord read) { + writer.addAlignment(read); + } + public Integer map(RefMetaDataTracker tracker, String ref, LocusContext context) { List reads = context.getReads(); ArrayList goodReads = new ArrayList(); for ( SAMRecord read : reads ) { - if ( read.getReadLength() <= maxReadLength ) + if ( read.getReadLength() <= maxReadLength && + !read.getReadUnmappedFlag() && + !read.getNotPrimaryAlignmentFlag() && + !read.getDuplicateReadFlag() && + read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) goodReads.add(read); + else + writer.addAlignment(read); } clean(goodReads, ref, context.getLocation().getStart()); @@ -59,34 +78,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker writer.close(); } - private static int mismatchQualitySumCigar(AlignedRead aRead, String ref, int refIndex) { - String read = aRead.getReadString(); - String quals = aRead.getBaseQualityString(); - Cigar c = aRead.getCigar(); - - int sum = 0; - int readIndex = 0; - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - switch( ce.getOperator() ) { - case M: - for ( int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) { - if ( Character.toUpperCase(read.charAt(readIndex)) != Character.toUpperCase(ref.charAt(refIndex)) ) - sum += (int)quals.charAt(readIndex) - 33; - } - break; - case I: - readIndex += ce.getLength(); - break; - case D: - refIndex += ce.getLength(); - break; - default: throw new RuntimeException("Unrecognized cigar element"); - } - } - return sum; - } - private static int mismatchQualitySum(AlignedRead aRead, String ref, int refIndex) { String read = aRead.getReadString(); String quals = aRead.getBaseQualityString(); @@ -136,6 +127,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker AlignedRead aRead = altReads.get(index); SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString()); int refIdx = swConsensus.getAlignmentStart2wrt1(); + if ( refIdx < 0 || refIdx > reference.length() - aRead.getReadString().length() ) + continue; // create the new consensus StringBuffer sb = new StringBuffer(); @@ -199,7 +192,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker } // if the best alternate consensus has a smaller sum of quality score mismatches, then clean! - if ( bestConsensus.mismatchSum < totalMismatchSum ) { + if ( bestConsensus != null && bestConsensus.mismatchSum < totalMismatchSum ) { logger.info("CLEAN: " + bestConsensus.str); // clean the appropriate reads