diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 004a98b10..97c37f76b 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.IntervalWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; @@ -247,8 +248,8 @@ public class GenomeAnalysisTK extends CommandLineProgram { MicroManager microManager = null; - // Try to get the walker specified - try { + // Get the walker specified + if ( my_walker instanceof LocusWalker ) { LocusWalker walker = (LocusWalker) my_walker; if ( REF_FILE_ARG == null ) @@ -277,10 +278,10 @@ public class GenomeAnalysisTK extends CommandLineProgram { else this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods); } - } - catch (java.lang.ClassCastException e) { - // I guess we're a read walker LOL - ReadWalker walker = (ReadWalker) my_walker; + } else if ( my_walker instanceof IntervalWalker ) { + this.engine = new TraverseByIntervals(INPUT_FILE, REF_FILE_ARG, rods); + } else { + // we're a read walker this.engine = new TraverseByReads(INPUT_FILE, REF_FILE_ARG, rods); } diff --git a/java/src/org/broadinstitute/sting/gatk/LocusContext.java b/java/src/org/broadinstitute/sting/gatk/LocusContext.java index 3492ea5e0..e4aa2829e 100755 --- a/java/src/org/broadinstitute/sting/gatk/LocusContext.java +++ b/java/src/org/broadinstitute/sting/gatk/LocusContext.java @@ -120,9 +120,11 @@ public class LocusContext { i++; } - ArrayList downsampledReads = new ArrayList(); + ArrayList downsampledReads = new ArrayList(); + ArrayList downsampledOffsets = new ArrayList(); Iterator positionIter = positions.iterator(); - Iterator readsIter = reads.iterator(); + Iterator readsIter = reads.iterator(); + Iterator offsetsIter = offsets.iterator(); int currentRead = 0; while ( positionIter.hasNext() ) { int nextReadToKeep = (Integer)positionIter.next(); @@ -130,13 +132,16 @@ public class LocusContext { // fast-forward to the right read while ( currentRead < nextReadToKeep ) { readsIter.next(); + offsetsIter.next(); currentRead++; } downsampledReads.add(readsIter.next()); + downsampledOffsets.add(offsetsIter.next()); currentRead++; } reads = downsampledReads; + offsets = downsampledOffsets; } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 7343a4d07..e05ea91b8 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -496,8 +496,7 @@ public abstract class TraversalEngine { /** * Class to filter out un-handle-able reads from the stream. We currently are skipping - * unmapped reads, non-primary reads, unaligned reads, and those with indels. We should - * really change this to handle indel containing reads. + * unmapped reads, non-primary reads, unaligned reads, and duplicate reads. */ public static class locusStreamFilterFunc implements SamRecordFilter { SAMRecord lastRead = null; diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByIntervals.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByIntervals.java new file mode 100755 index 000000000..043a56ad3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByIntervals.java @@ -0,0 +1,156 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.walkers.IntervalWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; + +import java.util.List; +import java.util.Iterator; +import java.util.ArrayList; +import java.io.File; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import edu.mit.broad.picard.filter.FilteringIterator; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Apr 23, 2009 + * Time: 10:26:03 AM + * To change this template use File | Settings | File Templates. + */ +public class TraverseByIntervals extends TraversalEngine { + + public TraverseByIntervals(File reads, File ref, List> rods) { + super(reads, ref, rods); + } + + public T traverse(Walker walker, ArrayList locations) { + if ( walker instanceof IntervalWalker ) { + IntervalWalker intervalWalker = (IntervalWalker)walker; + T sum = traverseByIntervals(intervalWalker, locations); + return sum; + } else { + throw new IllegalArgumentException("Walker isn't an interval walker!"); + } + } + + /** + * Traverse by intervals -- the key driver of linearly ordered traversal of intervals. Provides reads, RODs, and + * the reference base for each interval in the reference to the intervalWalker walker. Supports all of the + * interaction contract implied by the interval walker + * + * @param walker An interval walker object + * @param MapType -- the result of calling map() on walker + * @param ReduceType -- the result of calling reduce() on the walker + * @return 0 on success + */ + protected T traverseByIntervals(IntervalWalker walker, ArrayList locations) { + logger.debug("Entering traverseByIntervals"); + + samReader = initializeSAMFile(readsFile); + + verifySortOrder(true); + + walker.initialize(); + + T sum = walker.reduceInit(); + + if ( locations.isEmpty() ) { + logger.debug("There are no intervals provided for the traversal"); + } else { + if ( ! samReader.hasIndex() ) + Utils.scareUser("Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting."); + + for ( GenomeLoc interval : locations ) { + logger.debug(String.format("Processing interval %s", interval.toString())); + + CloseableIterator readIter = samReader.queryOverlapping( interval.getContig(), + (int)interval.getStart(), + (int)interval.getStop()); + + Iterator wrappedIter = WrapReadsIterator( readIter, false ); + sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval); + readIter.close(); + } + } + + printOnTraversalDone("intervals", sum); + walker.onTraversalDone(sum); + return sum; + } + + protected T carryWalkerOverInterval(IntervalWalker walker, Iterator readIter, T sum, GenomeLoc interval ) { + logger.debug(String.format("TraverseByIntervals.carryWalkerOverInterval Genomic interval is %s", interval)); + + // prepare the read filtering read iterator and provide it to a new interval iterator + FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc()); + + ArrayList reads = new ArrayList(); + ArrayList offsets = new ArrayList(); + boolean done = false; + while (filterIter.hasNext() && !done) { + TraversalStatistics.nRecords++; + SAMRecord read = filterIter.next(); + reads.add(read); + offsets.add((int)(read.getAlignmentStart() - interval.getStart())); + if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); + done = true; + } + } + + LocusContext locus = new LocusContext(interval, reads, offsets); + if ( DOWNSAMPLE_BY_COVERAGE ) + locus.downsampleToCoverage(downsamplingCoverage); + + ReferenceIterator refSite = refIter.seekForward(locus.getLocation()); + locus.setReferenceContig(refSite.getCurrentContig()); + + // Iterate forward to get all reference ordered data covering this interval + final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus(locus.getLocation()); + + sum = walkAtinterval( walker, sum, locus, refSite, tracker ); + + //System.out.format("Working at %s\n", locus.getLocation().toString()); + + printProgress("intervals", locus.getLocation()); + + return sum; + } + + protected T walkAtinterval( final IntervalWalker walker, + T sum, + final LocusContext locus, + final ReferenceIterator refSite, + final RefMetaDataTracker tracker ) { + ReferenceIterator refSiteCopy = refSite; + StringBuffer refBases = new StringBuffer(refSiteCopy.getBaseAsString()); + int locusLength = (int)(locus.getLocation().getStop() - locus.getLocation().getStart()); + for ( int i = 0; i < locusLength; i++ ) { + refSiteCopy = refSiteCopy.next(); + refBases.append(refSiteCopy.getBaseAsChar()); + } + + //logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase)); + + // + // Execute our contract with the walker. Call filter, map, and reduce + // + final boolean keepMeP = walker.filter(tracker, refBases.toString(), locus); + if (keepMeP) { + M x = walker.map(tracker, refBases.toString(), locus); + sum = walker.reduce(x, sum); + } + + //printProgress("intervals", interval.getLocation()); + return sum; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/IntervalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/IntervalWalker.java new file mode 100755 index 000000000..5573c161c --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/IntervalWalker.java @@ -0,0 +1,25 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Apr 23, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +public abstract class IntervalWalker extends Walker { + // Do we actually want to operate on the context? + public boolean filter(RefMetaDataTracker tracker, String ref, LocusContext context) { + return true; // We are keeping all the reads + } + + // Map over the org.broadinstitute.sting.gatk.LocusContext + public abstract MapType map(RefMetaDataTracker tracker, String ref, LocusContext context); + + // Given result of map function + public abstract ReduceType reduceInit(); + public abstract ReduceType reduce(MapType value, ReduceType sum); +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java new file mode 100755 index 000000000..dff15cb37 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java @@ -0,0 +1,35 @@ + +package org.broadinstitute.sting.playground.gatk.walkers; + +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.IntervalWalker; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import net.sf.samtools.*; + +@WalkerName("IntervalCleaner") +public class IntervalCleanerWalker extends IntervalWalker { + @Argument(fullName="maxReadLength", shortName="maxRead", required=false, defaultValue="-1") + public int maxReadLength; + + public void initialize() {} + + public Integer map(RefMetaDataTracker tracker, String ref, LocusContext context) { + out.println(context.getLocation() + " (" + context.getReads().size() + ") => " + ref); + return 1; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + out.println("Saw " + result + " intervals"); + } +} \ No newline at end of file