Completely rewritten duplicate traversal, more free of bugs, with integration tests for count duplicates walker validated on a TCGA hybrid capture lane.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2458 348d0f76-0448-11de-a6fe-93d51630548a
2009-12-28 23:56:49 +00:00 · 2009-12-28 23:56:49 +00:00 · fcc80e8632
parent 4617052b3c
commit fcc80e8632
8 changed files with 227 additions and 240 deletions
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
@ -32,6 +32,7 @@ import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
 import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
+import org.broadinstitute.sting.gatk.datasources.providers.ManagingReferenceOrderedView;
 import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
 import org.broadinstitute.sting.gatk.datasources.shards.Shard;
 import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
@ -40,11 +41,10 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.Pair;
+import org.broadinstitute.sting.utils.StingException;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;

-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;

 /**
 * @author Mark DePristo
@ -85,128 +85,58 @@ public class TraverseDuplicates extends TraversalEngine {
        return l;
    }

-    protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
-        List<SAMRecord> uniques = new ArrayList<SAMRecord>();
-        List<SAMRecord> dups = new ArrayList<SAMRecord>();
-        
-        // find the first duplicate
-        SAMRecord key = null;
-        for (SAMRecord read : reads) {
-            if (read.getDuplicateReadFlag()) {
-                // this is our key
-                key = read;
-                if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName()));
-                break;
+    protected Set<List<SAMRecord>> uniqueReadSets(List<SAMRecord> reads) {
+        Set<List<SAMRecord>> readSets = new HashSet<List<SAMRecord>>();
+        for ( SAMRecord read : reads ) {
+            
+            List<SAMRecord> readSet = findDuplicateReads(read, readSets);
+            if ( readSet == null ) {
+                readSets.add(new ArrayList<SAMRecord>(Arrays.asList(read)));    // copy so I can add to the list
+            } else {
+                readSet.add(read);
            }
        }

-        // At this point, there are two possibilities, we have found at least one dup or not
-        // if it's a dup, add it to the dups list, otherwise add it to the uniques list 
-        if (key != null) {
-            final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
-            final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null :
-                    GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
-            for (SAMRecord read : reads) {
-                final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
-                final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null :
-                        GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
-                if (DEBUG)
-                    logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
+        return readSets;
+    }
+
+    protected List<SAMRecord> findDuplicateReads(SAMRecord read, Set<List<SAMRecord>> readSets ) {
+        if ( read.getReadPairedFlag() ) {
+            // paired
+            final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
+
+            for (List<SAMRecord> reads : readSets) {
+                SAMRecord key = reads.get(0);
+                //if (DEBUG)
+                //    logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));

                // read and key start at the same place, and either the this read and the key
                // share a mate location or the read is flagged as a duplicate
-                if (readLoc.compareTo(keyLoc) == 0 ||
-                        read.getDuplicateReadFlag()) {
-                    if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) ||
-                            (readMateLoc == null && keyMateLoc == null)) {
+                if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) {
+                    // at least one has to be marked as a duplicate
+                    final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
+                    if ( readMateLoc.compareTo(keyMateLoc) == 0 ) {
                        // we are at the same position as the dup and have the same mat pos, it's a dup
-                        if (DEBUG) logger.debug(String.format("  => Adding read to dups list: %s%n", read));
-                        dups.add(read);
-                    } else {
-                        uniques.add(read);
+                        if (DEBUG) logger.debug(String.format("  => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc));
+                        return reads;
                    }
-                } else {
-                    uniques.add(read);
                }
            }
        } else {
-            uniques = reads;
-        }
-
-        return new Pair<List<SAMRecord>, List<SAMRecord>>(uniques, dups);
-    }
-
-    /**
-     * Traverse by reads, given the data and the walker
-     *
-     * @param sum       of type T, the return from the walker
-     * @param <M>       the generic type
-     * @param <T>       the return type of the reduce function
-     * @param dupWalker our duplicates walker
-     * @param readIter  our iterator
-     *
-     * @return the reduce type, T, the final product of all the reduce calls
-     */
-    private <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
-                                      Iterator<SAMRecord> readIter,
-                                      T sum) {
-        /**
-         * while we still have more reads:
-         * ok, here's the idea.  We get all the reads that start at the same position in the genome
-         * We then split the list of reads into sublists of reads:
-         *   -> those with the same mate pair position, for paired reads
-         *   -> those flagged as unpaired and duplicated but having the same start and end and
-         */
-        PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
-
-
-        for (SAMRecord read : iter) {
-
-            // get the genome loc from the read
-            GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
-
-            List<SAMRecord> reads = readsAtLoc(read, iter);
-            Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
-            List<SAMRecord> uniqueReads = split.getFirst();
-            List<SAMRecord> duplicateReads = split.getSecond();
-
-            logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
-                                       site, reads.size(), uniqueReads.size(), duplicateReads.size()));
-            if (reads.size() != uniqueReads.size() + duplicateReads.size())
-                throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
-                                                         reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
-
-            // Jump forward in the reference to this locus location
-            AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));
-
-            // update the number of duplicate sets we've seen
-            TraversalStatistics.nRecords++;
-
-            // we still have to fix the locus context provider to take care of this problem with > 1 length contexts
-            // AlignmentContext locus = locusProvider.getLocusContext(site);
-
-            byte[] refBases = new byte[0];
-
-            if (dupWalker.mapUniqueReadsTooP()) {
-                // Send each unique read to the map function
-                for (SAMRecord unique : uniqueReads) {
-                    List<SAMRecord> l = Arrays.asList(unique);
-                    sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
+            for (List<SAMRecord> reads : readSets) {
+                SAMRecord key = reads.get(0);
+                boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength();
+                //System.out.printf("%s %s %b %b %d %d %d %d => %b%n",
+                //        read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(),
+                //        read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v);
+                if ( v ) {
+                    //System.out.printf("Returning reads...%n");
+                    return reads;
                }
            }
-
-            if (duplicateReads.size() > 0)
-                sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
-
-            printProgress(DUPS_STRING, site);
-
-            if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
-                logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
-                break;
-            }
        }

-        return sum;
+        return null;
    }

    /**
@ -232,7 +162,7 @@ public class TraverseDuplicates extends TraversalEngine {

            if (result) {
                TraversalStatistics.nSkippedReads++;
-                //System.out.printf("  [filter] %s => %b %s", rec.getReadName(), result, why);
+                //System.out.printf("  [filter] %s => %b", rec.getReadName(), result);
            } else {
                TraversalStatistics.nReads++;
            }
@ -240,27 +170,12 @@ public class TraverseDuplicates extends TraversalEngine {
        }
    }

-    public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
-                           List<SAMRecord> uniqueReads,
-                           List<SAMRecord> duplicateReads,
-                           GenomeLoc site,
-                           byte[] refBases,
-                           AlignmentContext locus,
-                           T sum) {
-        final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads);
-        if (keepMeP) {
-            M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads);
-            sum = dupWalker.reduce(x, sum);
-        }
-        return sum;
-    }
-
-
    // --------------------------------------------------------------------------------------------------------------
    //
    // new style interface to the system
    //
    // --------------------------------------------------------------------------------------------------------------
+
    /**
     * Traverse by reads, given the data and the walker
     *
@ -276,8 +191,7 @@ public class TraverseDuplicates extends TraversalEngine {
                             Shard shard,
                             ShardDataProvider dataProvider,
                             T sum) {
-
-        logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
+        //logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", shard.getGenomeLoc()));

        if (!(walker instanceof DuplicateWalker))
            throw new IllegalArgumentException("Walker isn't a duplicate walker!");
@ -292,13 +206,46 @@ public class TraverseDuplicates extends TraversalEngine {

        FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc());
        PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
-        return actuallyTraverse(dupWalker, iter, sum);
-    }

+        /**
+         * While we still have more reads:
+         * here's the idea.  We get all the reads that start at the same position in the genome.
+         * We then split the list of reads into sublists of reads:
+         *   -> paired reads with the same mate position, at least one of them flagged as a duplicate
+         *   -> unpaired reads with the same start and read length, at least one flagged as a duplicate
+         */
+        for (SAMRecord read : iter) {
+            // get the genome loc from the read
+            GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
+
+            Set<List<SAMRecord>> readSets = uniqueReadSets(readsAtLoc(read, iter));
+            logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size()));
+
+            // Jump forward in the reference to this locus location
+            AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileup(site));
+
+            // update the number of duplicate sets we've seen
+            TraversalStatistics.nRecords++;
+
+            final boolean keepMeP = dupWalker.filter(site, locus, readSets);
+            if (keepMeP) {
+                M x = dupWalker.map(site, locus, readSets);
+                sum = dupWalker.reduce(x, sum);
+            }
+
+            printProgress(DUPS_STRING, site);
+
+            if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
+                logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
+                break;
+            }
+        }
+
+        return sum;
+    }

    /**
     * Temporary override of printOnTraversalDone.
-     * TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
     *
     * @param sum Result of the computation.
     * @param <T> Type of the result.
--- a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java
@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.utils.GenomeLoc;

 import java.util.List;
+import java.util.Set;

 import net.sf.samtools.SAMRecord;

@ -17,9 +18,7 @@ import net.sf.samtools.SAMRecord;
@Requires({DataSource.READS,DataSource.REFERENCE})
 public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
    // Do we actually want to operate on the context?
-    public boolean filter(GenomeLoc loc, byte[] refBases, AlignmentContext context,
-                          List<SAMRecord> uniqueReads,
-                          List<SAMRecord> duplicateReads) {
+    public boolean filter(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
        return true;    // We are keeping all the reads
    }

@ -31,9 +30,14 @@ public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapTyp
     */
    public boolean mapUniqueReadsTooP() { return false; }

-    public abstract MapType map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
-                                List<SAMRecord> uniqueReads,
-                                List<SAMRecord> duplicateReads);
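+    /**
+     * Hook intended to let the traversal engine decide whether to call map() at loci without duplicate reads.
+     * NOTE(review): the traverse() loop added in this commit does not appear to consult this hook -- confirm.
+     * @return true if you want to see non duplicates during the traversal
+     */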
+    public boolean mapAtLociWithoutDuplicates() { return true; }
+    
+    public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets );

    // Given result of map function
    public abstract ReduceType reduceInit();
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DuplicateQualsWalker.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DuplicateQualsWalker.java
@ -23,7 +23,7 @@
 * OTHER DEALINGS IN THE SOFTWARE.
 */

-package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
+package org.broadinstitute.sting.oneoffprojects.walkers;

 import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
 import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;

 class MismatchCounter {
    long nObs = 0;
@ -160,41 +161,40 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
    }

    // Print out data for regression
-    public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
-                                   List<SAMRecord> uniqueReads,
-                                   List<SAMRecord> duplicateReads) {
+    public List<DuplicateComp> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
        //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
        List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
-        
-        if ( ! ACTUALLY_DO_WORK )
-            return pairwiseComps;

-        if ( COMBINE_QUALS ) {
-            Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
-            if ( combinedReads != null ) {
-                SAMRecord combined1 = combinedReads.first;
-                SAMRecord combined2 = combinedReads.second;
-
-                if ( comparePairToSingleton )
-                    pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
-                else
-                    pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
-            }
-        } else {
-            int nComparisons = 0;
-            for ( SAMRecord read1 : duplicateReads ) {
-                for ( SAMRecord read2 : duplicateReads ) {
-                    if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
-                        // the hashcode insures we don't do A vs. B and B vs. A
-                        //System.out.printf("Comparing %s against %s%n", read1, read2);
-                        nComparisons++;
-                        pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
-                        if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
-                            break;
-                    }
-                }
-            }
-        }
+        // todo -- fixme -- the logic here is all wrong given new interface        
+//        if ( ! ACTUALLY_DO_WORK )
+//            return pairwiseComps;
+//
+//        if ( COMBINE_QUALS ) {
+//            Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
+//            if ( combinedReads != null ) {
+//                SAMRecord combined1 = combinedReads.first;
+//                SAMRecord combined2 = combinedReads.second;
+//
+//                if ( comparePairToSingleton )
+//                    pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
+//                else
+//                    pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
+//            }
+//        } else {
+//            int nComparisons = 0;
+//            for ( SAMRecord read1 : duplicateReads ) {
+//                for ( SAMRecord read2 : duplicateReads ) {
+//                    if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
+//                        // the hashcode insures we don't do A vs. B and B vs. A
+//                        //System.out.printf("Comparing %s against %s%n", read1, read2);
+//                        nComparisons++;
+//                        pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
+//                        if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
+//                            break;
+//                    }
+//                }
+//            }
+//        }

        return pairwiseComps;
    }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java
@ -7,6 +7,8 @@ import org.broadinstitute.sting.utils.duplicates.DupUtils;
 import org.broadinstitute.sting.utils.cmdLine.Argument;

 import java.util.List;
+import java.util.Set;
+import java.util.ArrayList;

 import net.sf.samtools.SAMRecord;
 import net.sf.samtools.SAMFileWriter;
@ -15,7 +17,7 @@ import net.sf.samtools.SAMFileWriter;
 * Process the input bam file, optionally emitting all the unique reads found, and emitting the combined duplicate reads to
 * the specified output BAM location.  If no output location is specified, the reads are written to STDOUT. 
 */
-public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileWriter> {
+public class CombineDuplicatesWalker extends DuplicateWalker<List<SAMRecord>, SAMFileWriter> {
    @Argument(fullName="outputBAM", shortName="outputBAM", required=false, doc="BAM File to write combined duplicates to")
    public SAMFileWriter outputBAM = null;

@ -45,50 +47,58 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
    /**
     * emit the read that was produced by combining the dupplicates
     */
-    public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
-        if ( output != null ) {
-            output.addAlignment(read);
-        } else {
-            out.println(read.format());
+    public SAMFileWriter reduce(List<SAMRecord> reads, SAMFileWriter output) {
+        for ( SAMRecord read : reads ) {
+            if ( output != null ) {
+                output.addAlignment(read);
+            } else {
+                out.println(read.format());
+            }
        }

        return output;
    }

+    /**
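+     * We don't want to see loci without duplicates, since this walker's purpose is to combine duplicate reads.
+     * @return false, so that loci without duplicate reads are not requested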
+     */
+    public boolean mapAtLociWithoutDuplicates() { return false; }
+
    /**
     * Build a combined read given the input list of non-unique reads.  If there's just one read in the
     * set, it's considered unique and returned.  If there's more than one, the N-way combine
     * duplicate function is invoked.
     *
     * @param loc the genome loc
-     * @param refBases the reference bases for the given locus
     * @param context the alignment context that has the reads information
-     * @param duplicateReads a list of the dupplicate reads at this locus
-     * @param uniqueReads the unique read list at this locus
+     * @param readSets the set of read lists at this locus; each list groups reads that are duplicates of one another
     * @return a read that combines the dupplicate reads at this locus
     */
-    public SAMRecord map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
-                         List<SAMRecord> uniqueReads,
-                         List<SAMRecord> duplicateReads) {
-        //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
+    public List<SAMRecord> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
+        List<SAMRecord> combinedReads = new ArrayList<SAMRecord>();

-        SAMRecord combinedRead = null;
+        for ( List<SAMRecord> reads : readSets ) {
+            SAMRecord combinedRead = null;

-        if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) {
-            // we are a unique read
-            combinedRead = duplicateReads.get(0);
-        } else {
-            // actually call the combine function
-            //for (SAMRecord read : duplicateReads ) {
-            //    System.out.printf("Read %s%n", read.format());
-            //}
-            combinedRead = DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE);
+            if ( reads.size() == 1 && ! reads.get(0).getDuplicateReadFlag() ) {
+                // we are a unique read
+                combinedRead = reads.get(0);
+            } else {
+                // actually call the combine function
+                //for (SAMRecord read : duplicateReads ) {
+                //    System.out.printf("Read %s%n", read.format());
+                //}
+                combinedRead = DupUtils.combineDuplicates(reads, MAX_QUALITY_SCORE);
+            }
+
+            if ( combinedRead.getDuplicateReadFlag() )
+                throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
+                        combinedRead.getReadName(), reads.size(), combinedRead.format()));
+
+            combinedReads.add(combinedRead);
        }

-        if ( combinedRead.getDuplicateReadFlag() )
-            throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
-                    combinedRead.getReadName(), duplicateReads.size(), combinedRead.format()));
-        
-        return combinedRead;
+        return combinedReads;
    }
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java
@ -29,41 +29,61 @@ import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
 import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.cmdLine.Argument;

 import java.util.List;
+import java.util.Set;
+import java.util.ArrayList;

 /**
 * a class to store the traversal information we pass around
 */
 class DuplicateCount  {
-        public int count = 0; // the count of sites we were given
-        public int undupDepth = 0; // the unique read count
-        public int depth = 0; // the dupplicate read depth
-    }
+    public int count = 0;       // the count of sites we were given
+    public int nUniqueMolecules = 0;  // the number of read sets seen
+    public int nDuplicatedMolecules = 0;  // the number of read sets containing more than one read
+    public int depth = 0;       // the total number of reads across all read sets
+}

 /**
 * Count the number of unique reads, duplicates, and the average depth of unique reads and duplicates at all positions.
 * @author aaron
 */
 public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, DuplicateCount> {
+    @Argument(fullName="quiet", required=false, doc="If true, per locus information isn't printex")
+    public boolean quiet = false;

    /**
     * the map function, conforming to the duplicates interface
     * @param loc the genomic location
-     * @param refBases the reference bases that cover this position, which we turn off
     * @param context the AlignmentContext, containing all the reads overlapping this region
-     * @param uniqueReads all the unique reads, bundled together
-     * @param duplicateReads all the duplicate reads
+     * @param readSets the read sets at this locus; each list groups reads that are duplicates of one another
     * @return a DuplicateCount object, with the appropriate stats
     */
-    public DuplicateCount map(GenomeLoc loc, byte[] refBases, AlignmentContext context, List<SAMRecord> uniqueReads, List<SAMRecord> duplicateReads) {
+    public DuplicateCount map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
+        if ( ! quiet ) out.printf("%s with %d read sets => ", loc, readSets.size());
+
        DuplicateCount dup = new DuplicateCount();
+        dup.depth = 0;
+        for ( List<SAMRecord> reads : readSets) {
+            List<String> names = new ArrayList<String>();
+            for ( SAMRecord read : reads ) {
+                names.add(read.getReadName());
+            }
+            if ( ! quiet ) out.printf("%d reads [%s] ", reads.size(), Utils.join(",", names));
+            dup.depth += reads.size();
+            dup.nDuplicatedMolecules += reads.size() > 1 ? 1 : 0;   // a set with more than 1 read counts as one duplicated molecule
+        }
+        if ( ! quiet ) out.printf("%n");
+
        dup.count = 1;
-        dup.undupDepth = uniqueReads.size();
-        dup.depth = duplicateReads.size();
+        dup.nUniqueMolecules = readSets.size();
        return dup;
    }

+    public boolean mapAtLociWithoutDuplicates() { return true; }
+
    /**
     * setup our walker.  In this case, new a DuplicateCount object and return it
     * @return the object holding the counts of the duplicates
@ -82,7 +102,8 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
        DuplicateCount dup = new DuplicateCount();
        dup.count = sum.count + value.count;
        dup.depth = value.depth + sum.depth;
-        dup.undupDepth = value.undupDepth + sum.undupDepth;
+        dup.nDuplicatedMolecules = value.nDuplicatedMolecules + sum.nDuplicatedMolecules;
+        dup.nUniqueMolecules = value.nUniqueMolecules + sum.nUniqueMolecules;
        return dup;
    }

@ -93,9 +114,10 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
    public void onTraversalDone(DuplicateCount result) {
        out.println("[REDUCE RESULT] Traversal result is: ");
        out.println("traversal iterations = " + result.count);
-        out.println("average depth = " + (double)result.depth / (double)result.count);
-        out.println("duplicates seen = " + result.depth);
-        out.println("unique read count = " + result.undupDepth);
-        out.println("average unique read depth = " + (double)result.undupDepth / (double)result.count);
+        out.printf("average depth = %.2f%n", (double)result.depth / (double)result.count);
+        out.println("unique molecules seen = " + result.nUniqueMolecules);
+        out.println("duplicated molecules seen = " + result.nDuplicatedMolecules);
+        out.printf("percent duplicated = %.2f%%%n", result.nDuplicatedMolecules / (double)result.nUniqueMolecules * 100);
+        out.printf("average unique read depth = %.2f%n", (double)result.nUniqueMolecules / (double)result.count);
    }
 }
--- a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java
+++ b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java
@ -39,6 +39,13 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
        this(loc, readsOffsets2Pileup(reads, offsets));
    }

+    /**
+     * Create a new version of a read backed pileup at loc without any aligned reads.
+     * Delegates to the (loc, pileup elements) constructor with an empty element list.
+     *
+     * @param loc the genome location this empty pileup is created at
+     */
+     public ReadBackedPileup(GenomeLoc loc ) {
+        this(loc, new ArrayList<PileupElement>(0));
+    }

    /**
     * Create a new version of a read backed pileup at loc, using the reads and their corresponding
--- a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java
@ -12,6 +12,7 @@ import org.junit.Test;

 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;


 /**
@ -37,49 +38,48 @@ public class TraverseDuplicatesTest extends BaseTest {
    public void testAllDupplicatesNoPairs() {
+        // Ten reads built with identical coordinates, each flagged as a duplicate.
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
            read.setDuplicateReadFlag(true);
            list.add(read);
        }
-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
-        Assert.assertEquals(0, myPairing.first.size());   // unique
-        Assert.assertEquals(10, myPairing.second.size()); // dup's
+        Set<List<SAMRecord>> myPairings = obj.uniqueReadSets(list);
+        // Exactly one read set is expected ...
+        Assert.assertEquals(1, myPairings.size());
+        Assert.assertEquals(10, myPairings.iterator().next().size()); // ... and it must hold all ten reads
    }

    @Test
    public void testNoDupplicatesNoPairs() {
+        // Ten reads built with identical coordinates, none flagged as a duplicate.
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
            read.setDuplicateReadFlag(false);
            list.add(read);
        }
-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
-        Assert.assertEquals(10, myPairing.first.size()); // unique
-        Assert.assertEquals(0, myPairing.second.size()); // dup's
+
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
+        Assert.assertEquals(10, myPairing.size()); // ten read sets expected, i.e. one per unflagged read
    }

    @Test
    public void testFiftyFiftyNoPairs() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
+        // First half: five reads built with identical coordinates, each flagged as a duplicate.
        for (int x = 0; x < 5; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
            read.setDuplicateReadFlag(true);
            list.add(read);
        }
+        // Second half: five reads whose fourth createArtificialRead argument varies with x (10..14);
+        // their duplicate flag is never set here.
        for (int x = 10; x < 15; x++)
            list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));

-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
-        Assert.assertEquals(5, myPairing.first.size());  // unique
-        Assert.assertEquals(5, myPairing.second.size()); // dup's
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
+        // Six read sets expected. NOTE(review): presumably one set for the five flagged reads plus
+        // one per unflagged read; confirm against uniqueReadSets.
+        Assert.assertEquals(6, myPairing.size());
    }

    @Test
    public void testAllDupplicatesAllPairs() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
            read.setDuplicateReadFlag(true);
            read.setMateAlignmentStart(100);
            read.setMateReferenceIndex(0);
@ -87,16 +87,15 @@ public class TraverseDuplicatesTest extends BaseTest {
            list.add(read);
        }

-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
-        Assert.assertEquals(0, myPairing.first.size());  // unique
-        Assert.assertEquals(10, myPairing.second.size()); // dup's
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
+        Assert.assertEquals(1, myPairing.size());  // unique
    }

    @Test
    public void testNoDupplicatesAllPairs() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
            if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
            read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
            read.setMateReferenceIndex(0);
@ -104,16 +103,15 @@ public class TraverseDuplicatesTest extends BaseTest {
            list.add(read);
        }

-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
-        Assert.assertEquals(0, myPairing.first.size());  // unique
-        Assert.assertEquals(10, myPairing.second.size()); // dup's
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
+        Assert.assertEquals(1, myPairing.size());  // unique
    }

    @Test
    public void testAllDupplicatesAllPairsDifferentPairedEnd() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
            if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
            read.setMateAlignmentStart(100 + x);
            read.setMateReferenceIndex(0);
@ -121,8 +119,7 @@ public class TraverseDuplicatesTest extends BaseTest {
            list.add(read);
        }

-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
-        Assert.assertEquals(9, myPairing.first.size());  // unique
-        Assert.assertEquals(1, myPairing.second.size()); // dup's
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
+        Assert.assertEquals(10, myPairing.size());  // unique
    }
 }
--- a/python/snpSelector.py
+++ b/python/snpSelector.py
@ -249,7 +249,7 @@ def calculateBinsForValues(values, field, minValue, maxValue, partitions):
            bins[-1][1] = bin[0]
            bins[-1][2] += curSize

-    #print 'Returning ', bins
+    print 'Returning ', bins
    #sys.exit(1)
    return bins