diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java
index 0879b21d0..5e5b425d6 100644
--- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java
+++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java
@@ -11,9 +11,7 @@ import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.traversals.*;
-import org.broadinstitute.sting.gatk.walkers.LocusWalker;
-import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
-import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
@@ -269,9 +267,14 @@ public class GenomeAnalysisTK extends CommandLineProgram {
}
} else if ( my_walker instanceof LocusWindowWalker ) {
this.engine = new TraverseByLocusWindows(INPUT_FILES, REF_FILE_ARG, rods);
- } else {
+ } else if ( my_walker instanceof ReadWalker) {
// we're a read walker
this.engine = new TraverseByReads(INPUT_FILES, REF_FILE_ARG, rods);
+ } else if ( my_walker instanceof DuplicateWalker) {
+ // we're a duplicate walker
+ this.engine = new TraverseDuplicates(INPUT_FILES, REF_FILE_ARG, rods);
+ } else {
+ throw new RuntimeException("Unexpected walker type: " + my_walker);
}
// Prepare the sort ordering w.r.t. the sequence dictionary
diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
new file mode 100755
index 000000000..b8f783c93
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
@@ -0,0 +1,289 @@
+package org.broadinstitute.sting.gatk.traversals;
+
+import net.sf.samtools.SAMRecord;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
+import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
+import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
+import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
+import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
+import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
+
+import java.io.File;
+import java.util.*;
+
+import edu.mit.broad.picard.filter.FilteringIterator;
+import edu.mit.broad.picard.filter.SamRecordFilter;
+
+/**
+ *
+ * User: aaron
+ * Date: Apr 24, 2009
+ * Time: 10:35:22 AM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+
+/**
+ * @author Mark DePristo
+ * @version 0.1
+ * @date Apr 29, 2009
+ *
+ * Class TraverseDuplicates
+ *
+ * This class handles traversing lists of duplicate reads in the new shardable style
+ */
+public class TraverseDuplicates extends TraversalEngine {
+ /** our log, which we want to capture anything from this class */
+ protected static Logger logger = Logger.getLogger(TraverseDuplicates.class);
+
+ private final boolean DEBUG = false;
+
+ /**
+ * Creates a new, uninitialized TraversalEngine
+ *
+ * @param reads SAM/BAM file of reads
+ * @param ref Reference file in FASTA format, assumes a .dict file is also available
+ * @param rods Array of reference ordered data sets
+ */
+ public TraverseDuplicates(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
+ super(reads, ref, rods);
+ }
+
+ private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter)
+ {
+ GenomeLoc site = new GenomeLoc(read);
+ ArrayList<SAMRecord> l = new ArrayList<SAMRecord>();
+
+ l.add(read);
+ for (SAMRecord read2: iter) {
+ GenomeLoc site2 = new GenomeLoc(read2);
+
+ // the next read starts too late
+ if ( site2.getStart() != site.getStart() ) {
+ //System.out.printf("site = %s, site2 = %s%n", site, site2);
+ iter.pushback(read2);
+ break;
+ } else {
+ //System.out.printf("Read is a duplicate: %s%n", read.format());
+ l.add(read2);
+ }
+ }
+
+ return l;
+ }
+
+ private List<SAMRecord> collectDuplicates(List<SAMRecord> reads) {
+ ArrayList<SAMRecord> dups = new ArrayList<SAMRecord>();
+
+ // find the first duplicate
+ SAMRecord key = null;
+ for ( SAMRecord read : reads ) {
+ if ( read.getDuplicateReadFlag() ) {
+ // this is our key
+ key = read;
+ if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName()));
+ break;
+ }
+ }
+
+ // At this point, there are two possibilities, we have found at least one dup or not
+ //System.out.printf("Key is %s%n", key);
+ if ( key != null ) {
+ final GenomeLoc keyLoc = new GenomeLoc(key);
+ final GenomeLoc keyMateLoc = new GenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
+
+ for ( SAMRecord read : reads ) {
+ final GenomeLoc readLoc = new GenomeLoc(read);
+ final GenomeLoc readMateLoc = new GenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
+ if (DEBUG) logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
+ if ( readLoc.compareTo(keyLoc) == 0 && readMateLoc.compareTo(keyMateLoc) == 0 ) {
+ // we are at the same position as the dup and have the same mat pos, it's a dup
+ if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
+ dups.add(read);
+ }
+ }
+ }
+
+ return dups;
+ }
+
+ /**
+ * Traverse the read stream as sets of duplicates, feeding each duplicate
+ * set (with its locus context) to the walker
+ * @param dupWalker the duplicate walker to execute over each duplicate set
+ * @param readIter the (already filtered) iterator over reads
+ * @param sum of type T, the initial reduce result
+ * @param <M> the map type of the walker
+ * @param <T> the return type of the reduce function
+ * @return the final reduce result, of type T
+ */
+ public <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
+ Iterator<SAMRecord> readIter,
+ T sum) {
+ // while we still have more reads
+ // ok, here's the idea. We get all the reads that start at the same position in the genome
+ // We then split the list of reads into sublists of reads:
+ // -> those with the same mate pair position, for paired reads
+ // -> those flagged as unpaired and duplicated but having the same start and end and
+ PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
+ for (SAMRecord read: iter) {
+ // get the genome loc from the read
+ GenomeLoc site = new GenomeLoc(read);
+ logger.debug(String.format("*** TraverseDuplicates.traverse at %s", site));
+ List<SAMRecord> reads = readsAtLoc(read, iter);
+ List<SAMRecord> duplicateReads = collectDuplicates(reads);
+
+ // Jump forward in the reference to this locus location
+ LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0));
+
+ // update the number of duplicate sets we've seen
+ TraversalStatistics.nRecords++;
+
+ // we still have to fix the locus context provider to take care of this problem with > 1 length contexts
+ // LocusContext locus = locusProvider.getLocusContext(site);
+
+ byte[] refBases = new byte[0];
+
+ final boolean keepMeP = dupWalker.filter(site, refBases, locus, duplicateReads);
+ if (keepMeP) {
+ M x = dupWalker.map(site, refBases, locus, duplicateReads);
+ sum = dupWalker.reduce(x, sum);
+ }
+
+ printProgress("dups", site);
+
+ if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
+ logger.warn(String.format("Maximum number of duplicate sets (%d) encountered, terminating traversal", TraversalStatistics.nRecords));
+ break;
+ }
+ }
+
+ return sum;
+ }
+
+ /**
+ * Class to filter out un-handle-able reads from the stream. We currently are skipping
+ * unmapped reads, non-primary reads, and reads with no alignment start (duplicates are kept).
+ */
+ public static class duplicateStreamFilterFunc implements SamRecordFilter {
+ SAMRecord lastRead = null;
+ public boolean filterOut(SAMRecord rec) {
+ boolean result = false;
+ String why = "";
+ if (rec.getReadUnmappedFlag()) {
+ TraversalStatistics.nUnmappedReads++;
+ result = true;
+ why = "Unmapped";
+ } else if (rec.getNotPrimaryAlignmentFlag()) {
+ TraversalStatistics.nNotPrimary++;
+ result = true;
+ why = "Not Primary";
+ } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
+ TraversalStatistics.nBadAlignments++;
+ result = true;
+ why = "No alignment start";
+ }
+ else {
+ result = false;
+ }
+
+ if (result) {
+ TraversalStatistics.nSkippedReads++;
+ //System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
+ } else {
+ TraversalStatistics.nReads++;
+ }
+ return result;
+ }
+ }
+
+
+ // --------------------------------------------------------------------------------------------------------------
+ //
+ // old style interface to the system
+ //
+ // --------------------------------------------------------------------------------------------------------------
+ public <M, T> T traverse(Walker<M, T> walker, ArrayList<GenomeLoc> locations) {
+ if ( walker instanceof DuplicateWalker) {
+ Walker x = walker;
+ DuplicateWalker<?, ?> dupWalker = (DuplicateWalker<?, ?>)x;
+ return (T)this.traverseByRead(dupWalker, locations);
+ } else {
+ throw new IllegalArgumentException("Walker isn't a duplicate walker!");
+ }
+ }
+
+ /**
+ * Should be deleted at the soonest possible opportunity
+ */
+ public <M, T> Object traverseByRead(DuplicateWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
+ samReadIter = initializeReads();
+
+ // Initialize the walker
+ walker.initialize();
+
+ // Initialize the sum
+ FilteringIterator filterIter = new FilteringIterator(samReadIter, new duplicateStreamFilterFunc());
+ T sum = actuallyTraverse(walker, filterIter, walker.reduceInit());
+
+ //printOnTraversalDone("reads", sum);
+ walker.onTraversalDone(sum);
+ return sum;
+ }
+
+ // --------------------------------------------------------------------------------------------------------------
+ //
+ // new style interface to the system
+ //
+ // --------------------------------------------------------------------------------------------------------------
+ /**
+ * Traverse the shard's reads as sets of duplicates, feeding each set to the walker
+ * @param walker the walker to execute over (must be a DuplicateWalker)
+ * @param shard the shard of data to feed the walker
+ * @param locusProvider the factory for loci
+ * @param readIter the bounded iterator over the shard's reads
+ * @param sum of type T, the initial reduce result
+ * @param <M> the map type of the walker
+ * @param <T> the return type of the reduce function
+ */
+ public <M, T> T traverse(Walker<M, T> walker,
+ Shard shard,
+ LocusContextProvider locusProvider,
+ BoundedReadIterator readIter,
+ T sum) {
+
+ logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard)shard).getSize()));
+
+ if (!(walker instanceof DuplicateWalker))
+ throw new IllegalArgumentException("Walker isn't a duplicate walker!");
+
+ DuplicateWalker<M, T> dupWalker = (DuplicateWalker<M, T>) walker;
+
+ // while we still have more reads
+ // ok, here's the idea. We get all the reads that start at the same position in the genome
+ // We then split the list of reads into sublists of reads:
+ // -> those with the same mate pair position, for paired reads
+ // -> those flagged as unpaired and duplicated but having the same start and end and
+
+ FilteringIterator filterIter = new FilteringIterator(readIter, new duplicateStreamFilterFunc());
+ // note: actuallyTraverse wraps the iterator in its own PushbackIterator
+ return actuallyTraverse(dupWalker, filterIter, sum);
+ }
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java
new file mode 100755
index 000000000..1fc494e6e
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java
@@ -0,0 +1,38 @@
+package org.broadinstitute.sting.gatk.walkers;
+
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.util.List;
+
+import net.sf.samtools.SAMRecord;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: mdepristo
+ * Date: Feb 22, 2009
+ * Time: 2:52:28 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
+ // Do we actually want to operate on the context?
+ public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
+ return true; // We are keeping all the reads
+ }
+
+ /**
+ * These two functions state whether we're don't make any sense without reads (requiresRead())
+ * or whether we can't take any reads at all (cannotHandleRead())
+ */
+ public boolean requiresReads() { return true; }
+ public boolean cannotHandleReads() { return false; }
+
+ // Map over the org.broadinstitute.sting.gatk.LocusContext
+ public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads);
+
+ // Given result of map function
+ public abstract ReduceType reduceInit();
+ public abstract ReduceType reduce(MapType value, ReduceType sum);
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java
new file mode 100755
index 000000000..1da9cab99
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java
@@ -0,0 +1,171 @@
+package org.broadinstitute.sting.playground.gatk.walkers;
+
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.io.PrintStream;
+import java.io.FileNotFoundException;
+
+import net.sf.samtools.SAMRecord;
+
+class MismatchCounter {
+ long nObs = 0;
+ long nMismatches = 0;
+
+ public void inc(long incNObs, long incNMismatches) {
+ nObs += incNObs;
+ nMismatches += incNMismatches;
+ }
+
+ public void inc(boolean mismatchP) {
+ inc(1, mismatchP ? 1 : 0);
+ }
+
+
+ public double mismatchRate() {
+ return (double)nMismatches / nObs;
+ }
+
+ public byte empiricalQualScore() {
+ return QualityUtils.probToQual(1 - mismatchRate(), 0);
+ }
+
+ public String headerString() {
+ return "mismatchRate\tempiricalQ\tnObs\tnMismatches";
+ }
+
+ public String toString() {
+ return String.format("%.10f\t%d\t%d\t%6d", mismatchRate(), empiricalQualScore(), nObs, nMismatches);
+ }
+}
+
+class QualityTracker {
+ final private int MAX_QUAL_SCORE = 50;
+ MismatchCounter[][] mismatchesByQ = new MismatchCounter[MAX_QUAL_SCORE][MAX_QUAL_SCORE];
+
+ public QualityTracker() {
+ for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
+ for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
+ mismatchesByQ[i][j] = new MismatchCounter();
+ }
+ }
+ }
+
+ public void inc(int b1Q, int b2Q, boolean mismatchP) {
+ if ( b1Q >= MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q);
+ if ( b2Q >= MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q);
+ mismatchesByQ[b1Q][b2Q].inc(mismatchP);
+ }
+
+ public void inc(DuplicateComp dc) {
+ inc(dc.qLarger, dc.qSmaller, dc.mismatchP);
+ }
+
+ public void printToStream(PrintStream out, boolean filterUnobserved) {
+ out.printf("Q1\tQ2\t%s%n", mismatchesByQ[0][0].headerString());
+ for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
+ for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
+ MismatchCounter mc = mismatchesByQ[i][j];
+ //System.out.printf("MC = %s%n", mc);
+ if ( filterUnobserved && mc.nObs == 0 )
+ continue;
+ out.printf("%d\t%d\t%s\t%n", i, j, mc.toString());
+ }
+ }
+ }
+}
+
+class DuplicateComp {
+ int qLarger;
+ int qSmaller;
+ boolean mismatchP;
+
+ public DuplicateComp( int b1Q, int b2Q, boolean mismatchP ) {
+ qLarger = Math.max(b1Q, b2Q);
+ qSmaller = Math.min(b1Q, b2Q);
+ this.mismatchP = mismatchP;
+ }
+
+ public String toString() {
+ return String.format("%d %d %b", qLarger, qSmaller, mismatchP);
+ }
+}
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: mdepristo
+ * Date: Feb 22, 2009
+ * Time: 2:52:28 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, QualityTracker> {
+ @Argument(fullName="filterUnobservedQuals", shortName="filterUnobservedQuals", required=false, defaultValue="false", doc="Show only quality bins with at least one observation in the data")
+ public boolean FILTER_UNOBSERVED_QUALS;
+
+ @Argument(fullName="maxPairwiseCompsPerDupSet", shortName="maxPairwiseCompsPerDupSet", required=false, defaultValue="100", doc="Maximumize number of pairwise comparisons to perform among duplicate read sets")
+ public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET;
+
+ final private boolean ACTUALLY_DO_WORK = true;
+
+ public void onTraversalDone(QualityTracker result) {
+ result.printToStream(out, FILTER_UNOBSERVED_QUALS);
+ }
+
+ public QualityTracker reduceInit() {
+ return new QualityTracker();
+ }
+
+ public QualityTracker reduce(List<DuplicateComp> dupComps, QualityTracker tracker) {
+ for ( DuplicateComp dc : dupComps ) {
+ tracker.inc(dc);
+ }
+
+ return tracker;
+ }
+
+ // Print out data for regression
+ public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
+ //out.printf("%s has %d duplicates%n", loc, duplicateReads.size());
+ List<DuplicateComp> comps = new ArrayList<DuplicateComp>();
+
+ if ( ! ACTUALLY_DO_WORK )
+ return comps;
+
+ int nComparisons = 0;
+ for ( SAMRecord read1 : duplicateReads ) {
+ byte[] read1Bases = read1.getReadBases();
+ byte[] read1Quals = read1.getBaseQualities();
+
+ for ( SAMRecord read2 : duplicateReads ) {
+ if ( read1 != read2 && read1.getReadLength() == read2.getReadLength()) {
+ byte[] read2Bases = read2.getReadBases();
+ byte[] read2Quals = read2.getBaseQualities();
+ nComparisons++;
+
+ for ( int i = 0; i < read1Bases.length; i++) {
+ byte read1Q = read1Quals[i];
+ byte read2Q = read2Quals[i];
+ boolean mismatchP = read1Bases[i] != read2Bases[i];
+ DuplicateComp dc = new DuplicateComp(read1Q, read2Q, mismatchP);
+
+ //logger.debug(String.format("dc: %s", dc));
+ comps.add(dc);
+ }
+
+ if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
+ break;
+ }
+ }
+ }
+
+ return comps;
+ }
+}
\ No newline at end of file