Completely rewritten duplicate traversal, more free of bugs, with integration tests for count duplicates walker validated on a TCGA hybrid capture lane.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2458 348d0f76-0448-11de-a6fe-93d51630548a
2009-12-28 23:56:49 +00:00 · 2009-12-28 23:56:49 +00:00 · fcc80e8632
parent 4617052b3c
commit fcc80e8632
8 changed files with 227 additions and 240 deletions
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
@ -32,6 +32,7 @@ import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
 import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
 import org.broadinstitute.sting.gatk.datasources.providers.ManagingReferenceOrderedView;
 import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
 import org.broadinstitute.sting.gatk.datasources.shards.Shard;
 import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
@ -40,11 +41,10 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.Pair;
 import org.broadinstitute.sting.utils.StingException;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
-import java.util.ArrayList;
+import java.util.*;
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 /**
 * @author Mark DePristo
@ -85,128 +85,58 @@ public class TraverseDuplicates extends TraversalEngine {
        return l;
    }
-    protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
+    protected Set<List<SAMRecord>> uniqueReadSets(List<SAMRecord> reads) {
-        List<SAMRecord> uniques = new ArrayList<SAMRecord>();
+        Set<List<SAMRecord>> readSets = new HashSet<List<SAMRecord>>();
-        List<SAMRecord> dups = new ArrayList<SAMRecord>();
+        for ( SAMRecord read : reads ) {
-        
+            
-        // find the first duplicate
+            List<SAMRecord> readSet = findDuplicateReads(read, readSets);
-        SAMRecord key = null;
+            if ( readSet == null ) {
-        for (SAMRecord read : reads) {
+                readSets.add(new ArrayList<SAMRecord>(Arrays.asList(read)));    // copy so I can add to the list
-            if (read.getDuplicateReadFlag()) {
+            } else {
-                // this is our key
+                readSet.add(read);
                key = read;
                if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName()));
                break;
            }
        }
-        // At this point, there are two possibilities, we have found at least one dup or not
+        return readSets;
-        // if it's a dup, add it to the dups list, otherwise add it to the uniques list 
+    }
-        if (key != null) {
+
-            final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
+    protected List<SAMRecord> findDuplicateReads(SAMRecord read, Set<List<SAMRecord>> readSets ) {
-            final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null :
+        if ( read.getReadPairedFlag() ) {
-                    GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
+            // paired
-            for (SAMRecord read : reads) {
+            final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
-                final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
+
-                final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null :
+            for (List<SAMRecord> reads : readSets) {
-                        GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
+                SAMRecord key = reads.get(0);
-                if (DEBUG)
+                //if (DEBUG)
-                    logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
+                //    logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
                // read and key start at the same place, and either the this read and the key
                // share a mate location or the read is flagged as a duplicate
-                if (readLoc.compareTo(keyLoc) == 0 ||
+                if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) {
-                        read.getDuplicateReadFlag()) {
+                    // at least one has to be marked as a duplicate
-                    if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) ||
+                    final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
-                            (readMateLoc == null && keyMateLoc == null)) {
+                    if ( readMateLoc.compareTo(keyMateLoc) == 0 ) {
                        // we are at the same position as the dup and have the same mat pos, it's a dup
-                        if (DEBUG) logger.debug(String.format("  => Adding read to dups list: %s%n", read));
+                        if (DEBUG) logger.debug(String.format("  => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc));
-                        dups.add(read);
+                        return reads;
                    } else {
                        uniques.add(read);
                    }
                } else {
                    uniques.add(read);
                }
            }
        } else {
-            uniques = reads;
+            for (List<SAMRecord> reads : readSets) {
-        }
+                SAMRecord key = reads.get(0);
-
+                boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength();
-        return new Pair<List<SAMRecord>, List<SAMRecord>>(uniques, dups);
+                //System.out.printf("%s %s %b %b %d %d %d %d => %b%n",
-    }
+                //        read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(),
-
+                //        read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v);
-    /**
+                if ( v ) {
-     * Traverse by reads, given the data and the walker
+                    //System.out.printf("Returning reads...%n");
-     *
+                    return reads;
     * @param sum       of type T, the return from the walker
     * @param <M>       the generic type
     * @param <T>       the return type of the reduce function
     * @param dupWalker our duplicates walker
     * @param readIter  our iterator
     *
     * @return the reduce type, T, the final product of all the reduce calls
     */
    private <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
                                      Iterator<SAMRecord> readIter,
                                      T sum) {
        /**
         * while we still have more reads:
         * ok, here's the idea.  We get all the reads that start at the same position in the genome
         * We then split the list of reads into sublists of reads:
         *   -> those with the same mate pair position, for paired reads
         *   -> those flagged as unpaired and duplicated but having the same start and end and
         */
        PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
        for (SAMRecord read : iter) {
            // get the genome loc from the read
            GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
            List<SAMRecord> reads = readsAtLoc(read, iter);
            Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
            List<SAMRecord> uniqueReads = split.getFirst();
            List<SAMRecord> duplicateReads = split.getSecond();
            logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
                                       site, reads.size(), uniqueReads.size(), duplicateReads.size()));
            if (reads.size() != uniqueReads.size() + duplicateReads.size())
                throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
                                                         reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
            // Jump forward in the reference to this locus location
            AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));
            // update the number of duplicate sets we've seen
            TraversalStatistics.nRecords++;
            // we still have to fix the locus context provider to take care of this problem with > 1 length contexts
            // AlignmentContext locus = locusProvider.getLocusContext(site);
            byte[] refBases = new byte[0];
            if (dupWalker.mapUniqueReadsTooP()) {
                // Send each unique read to the map function
                for (SAMRecord unique : uniqueReads) {
                    List<SAMRecord> l = Arrays.asList(unique);
                    sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
                }
            }
            if (duplicateReads.size() > 0)
                sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
            printProgress(DUPS_STRING, site);
            if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
                logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
                break;
            }
        }
-        return sum;
+        return null;
    }
    /**
@ -232,7 +162,7 @@ public class TraverseDuplicates extends TraversalEngine {
            if (result) {
                TraversalStatistics.nSkippedReads++;
-                //System.out.printf("  [filter] %s => %b %s", rec.getReadName(), result, why);
+                //System.out.printf("  [filter] %s => %b", rec.getReadName(), result);
            } else {
                TraversalStatistics.nReads++;
            }
@ -240,27 +170,12 @@ public class TraverseDuplicates extends TraversalEngine {
        }
    }
    public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
                           List<SAMRecord> uniqueReads,
                           List<SAMRecord> duplicateReads,
                           GenomeLoc site,
                           byte[] refBases,
                           AlignmentContext locus,
                           T sum) {
        final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads);
        if (keepMeP) {
            M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads);
            sum = dupWalker.reduce(x, sum);
        }
        return sum;
    }
    // --------------------------------------------------------------------------------------------------------------
    //
    // new style interface to the system
    //
    // --------------------------------------------------------------------------------------------------------------
    /**
     * Traverse by reads, given the data and the walker
     *
@ -276,8 +191,7 @@ public class TraverseDuplicates extends TraversalEngine {
                             Shard shard,
                             ShardDataProvider dataProvider,
                             T sum) {
-
+        //logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", shard.getGenomeLoc()));
        logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
        if (!(walker instanceof DuplicateWalker))
            throw new IllegalArgumentException("Walker isn't a duplicate walker!");
@ -292,13 +206,46 @@ public class TraverseDuplicates extends TraversalEngine {
        FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc());
        PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
        return actuallyTraverse(dupWalker, iter, sum);
    }
        /**
         * while we still have more reads:
         * ok, here's the idea.  We get all the reads that start at the same position in the genome
         * We then split the list of reads into sublists of reads:
         *   -> those with the same mate pair position, for paired reads
         *   -> those flagged as unpaired and duplicated but having the same start and end
         */
        for (SAMRecord read : iter) {
            // get the genome loc from the read
            GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
            Set<List<SAMRecord>> readSets = uniqueReadSets(readsAtLoc(read, iter));
            logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size()));
            // Jump forward in the reference to this locus location
            AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileup(site));
            // update the number of duplicate sets we've seen
            TraversalStatistics.nRecords++;
            final boolean keepMeP = dupWalker.filter(site, locus, readSets);
            if (keepMeP) {
                M x = dupWalker.map(site, locus, readSets);
                sum = dupWalker.reduce(x, sum);
            }
            printProgress(DUPS_STRING, site);
            if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
                logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
                break;
            }
        }
        return sum;
    }
    /**
     * Temporary override of printOnTraversalDone.
     * TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
     *
     * @param sum Result of the computation.
     * @param <T> Type of the result.
--- a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java
@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import java.util.List;
 import java.util.Set;
 import net.sf.samtools.SAMRecord;
@ -17,9 +18,7 @@ import net.sf.samtools.SAMRecord;
@Requires({DataSource.READS,DataSource.REFERENCE})
 public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
    // Do we actually want to operate on the context?
-    public boolean filter(GenomeLoc loc, byte[] refBases, AlignmentContext context,
+    public boolean filter(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
                          List<SAMRecord> uniqueReads,
                          List<SAMRecord> duplicateReads) {
        return true;    // We are keeping all the reads
    }
@ -31,9 +30,14 @@ public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapTyp
     */
    public boolean mapUniqueReadsTooP() { return false; }
-    public abstract MapType map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
+    /**
-                                List<SAMRecord> uniqueReads,
+     * Called by the traversal engine to decide whether to call map() at loci without duplicate reads
-                                List<SAMRecord> duplicateReads);
+     *
     * @return true if you want to see non duplicates during the traversal
     */
    public boolean mapAtLociWithoutDuplicates() { return true; }
    public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets );
    // Given result of map function
    public abstract ReduceType reduceInit();
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DuplicateQualsWalker.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DuplicateQualsWalker.java
@ -23,7 +23,7 @@
 * OTHER DEALINGS IN THE SOFTWARE.
 */
-package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
+package org.broadinstitute.sting.oneoffprojects.walkers;
 import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
 import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 class MismatchCounter {
    long nObs = 0;
@ -160,41 +161,40 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
    }
    // Print out data for regression
-    public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
+    public List<DuplicateComp> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
                                   List<SAMRecord> uniqueReads,
                                   List<SAMRecord> duplicateReads) {
        //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
        List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
        if ( ! ACTUALLY_DO_WORK )
            return pairwiseComps;
-        if ( COMBINE_QUALS ) {
+        // todo -- fixme -- the logic here is all wrong given new interface        
-            Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
+//        if ( ! ACTUALLY_DO_WORK )
-            if ( combinedReads != null ) {
+//            return pairwiseComps;
-                SAMRecord combined1 = combinedReads.first;
+//
-                SAMRecord combined2 = combinedReads.second;
+//        if ( COMBINE_QUALS ) {
-
+//            Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
-                if ( comparePairToSingleton )
+//            if ( combinedReads != null ) {
-                    pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
+//                SAMRecord combined1 = combinedReads.first;
-                else
+//                SAMRecord combined2 = combinedReads.second;
-                    pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
+//
-            }
+//                if ( comparePairToSingleton )
-        } else {
+//                    pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
-            int nComparisons = 0;
+//                else
-            for ( SAMRecord read1 : duplicateReads ) {
+//                    pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
-                for ( SAMRecord read2 : duplicateReads ) {
+//            }
-                    if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
+//        } else {
-                        // the hashcode insures we don't do A vs. B and B vs. A
+//            int nComparisons = 0;
-                        //System.out.printf("Comparing %s against %s%n", read1, read2);
+//            for ( SAMRecord read1 : duplicateReads ) {
-                        nComparisons++;
+//                for ( SAMRecord read2 : duplicateReads ) {
-                        pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
+//                    if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
-                        if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
+//                        // the hashcode insures we don't do A vs. B and B vs. A
-                            break;
+//                        //System.out.printf("Comparing %s against %s%n", read1, read2);
-                    }
+//                        nComparisons++;
-                }
+//                        pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
-            }
+//                        if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
-        }
+//                            break;
 //                    }
 //                }
 //            }
 //        }
        return pairwiseComps;
    }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java
@ -7,6 +7,8 @@ import org.broadinstitute.sting.utils.duplicates.DupUtils;
 import org.broadinstitute.sting.utils.cmdLine.Argument;
 import java.util.List;
 import java.util.Set;
 import java.util.ArrayList;
 import net.sf.samtools.SAMRecord;
 import net.sf.samtools.SAMFileWriter;
@ -15,7 +17,7 @@ import net.sf.samtools.SAMFileWriter;
 * Process the input bam file, optionally emitting all the unique reads found, and emitting the combined duplicate reads to
 * the specified output BAM location.  If no output location is specified, the reads are written to STDOUT. 
 */
-public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileWriter> {
+public class CombineDuplicatesWalker extends DuplicateWalker<List<SAMRecord>, SAMFileWriter> {
    @Argument(fullName="outputBAM", shortName="outputBAM", required=false, doc="BAM File to write combined duplicates to")
    public SAMFileWriter outputBAM = null;
@ -45,50 +47,58 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
    /**
     * emit the read that was produced by combining the dupplicates
     */
-    public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
+    public SAMFileWriter reduce(List<SAMRecord> reads, SAMFileWriter output) {
-        if ( output != null ) {
+        for ( SAMRecord read : reads ) {
-            output.addAlignment(read);
+            if ( output != null ) {
-        } else {
+                output.addAlignment(read);
-            out.println(read.format());
+            } else {
                out.println(read.format());
            }
        }
        return output;
    }
    /**
     * We don't want to see loci without duplicates, since 
     * @return
     */
    public boolean mapAtLociWithoutDuplicates() { return false; }
    /**
     * Build a combined read given the input list of non-unique reads.  If there's just one read in the
     * set, it's considered unique and returned.  If there's more than one, the N-way combine
     * duplicate function is invoked.
     *
     * @param loc the genome loc
     * @param refBases the reference bases for the given locus
     * @param context the alignment context that has the reads information
-     * @param duplicateReads a list of the dupplicate reads at this locus
+     * @param readSets the set of unique reads list at this locus
     * @param uniqueReads the unique read list at this locus
     * @return a read that combines the dupplicate reads at this locus
     */
-    public SAMRecord map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
+    public List<SAMRecord> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
-                         List<SAMRecord> uniqueReads,
+        List<SAMRecord> combinedReads = new ArrayList<SAMRecord>();
                         List<SAMRecord> duplicateReads) {
        //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
-        SAMRecord combinedRead = null;
+        for ( List<SAMRecord> reads : readSets ) {
            SAMRecord combinedRead = null;
-        if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) {
+            if ( reads.size() == 1 && ! reads.get(0).getDuplicateReadFlag() ) {
-            // we are a unique read
+                // we are a unique read
-            combinedRead = duplicateReads.get(0);
+                combinedRead = reads.get(0);
-        } else {
+            } else {
-            // actually call the combine function
+                // actually call the combine function
-            //for (SAMRecord read : duplicateReads ) {
+                //for (SAMRecord read : duplicateReads ) {
-            //    System.out.printf("Read %s%n", read.format());
+                //    System.out.printf("Read %s%n", read.format());
-            //}
+                //}
-            combinedRead = DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE);
+                combinedRead = DupUtils.combineDuplicates(reads, MAX_QUALITY_SCORE);
            }
            if ( combinedRead.getDuplicateReadFlag() )
                throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
                        combinedRead.getReadName(), reads.size(), combinedRead.format()));
            combinedReads.add(combinedRead);
        }
-        if ( combinedRead.getDuplicateReadFlag() )
+        return combinedReads;
            throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
                    combinedRead.getReadName(), duplicateReads.size(), combinedRead.format()));
        return combinedRead;
    }
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java
@ -29,41 +29,61 @@ import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.cmdLine.Argument;
 import java.util.List;
 import java.util.Set;
 import java.util.ArrayList;
 /**
 * a class to store the traversal information we pass around
 */
 class DuplicateCount  {
-        public int count = 0; // the count of sites we were given
+    public int count = 0;       // the count of sites we were given
-        public int undupDepth = 0; // the unique read count
+    public int nUniqueMolecules = 0;  // the unique read count
-        public int depth = 0; // the dupplicate read depth
+    public int nDuplicatedMolecules = 0;  // the unique read count
-    }
+    public int depth = 0;       // the dupplicate read depth
 }
 /**
 * Count the number of unique reads, duplicates, and the average depth of unique reads and duplicates at all positions.
 * @author aaron
 */
 public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, DuplicateCount> {
    @Argument(fullName="quiet", required=false, doc="If true, per locus information isn't printex")
    public boolean quiet = false;
    /**
     * the map function, conforming to the duplicates interface
     * @param loc the genomic location
     * @param refBases the reference bases that cover this position, which we turn off
     * @param context the AlignmentContext, containing all the reads overlapping this region
-     * @param uniqueReads all the unique reads, bundled together
+     * @param readSets all the duplicate reads
     * @param duplicateReads all the duplicate reads
     * @return a DuplicateCount object, with the appropriate stats
     */
-    public DuplicateCount map(GenomeLoc loc, byte[] refBases, AlignmentContext context, List<SAMRecord> uniqueReads, List<SAMRecord> duplicateReads) {
+    public DuplicateCount map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
        if ( ! quiet ) out.printf("%s with %d read sets => ", loc, readSets.size());
        DuplicateCount dup = new DuplicateCount();
        dup.depth = 0;
        for ( List<SAMRecord> reads : readSets) {
            List<String> names = new ArrayList<String>();
            for ( SAMRecord read : reads ) {
                names.add(read.getReadName());
            }
            if ( ! quiet ) out.printf("%d reads [%s] ", reads.size(), Utils.join(",", names));
            dup.depth += reads.size();
            dup.nDuplicatedMolecules += reads.size() > 1 ? 1 : 0;   // if there's more than 1 read per set, we're a duplicated reads
        }
        if ( ! quiet ) out.printf("%n");
        dup.count = 1;
-        dup.undupDepth = uniqueReads.size();
+        dup.nUniqueMolecules = readSets.size();
        dup.depth = duplicateReads.size();
        return dup;
    }
    public boolean mapAtLociWithoutDuplicates() { return true; }
    /**
     * setup our walker.  In this case, new a DuplicateCount object and return it
     * @return the object holding the counts of the duplicates
@ -82,7 +102,8 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
        DuplicateCount dup = new DuplicateCount();
        dup.count = sum.count + value.count;
        dup.depth = value.depth + sum.depth;
-        dup.undupDepth = value.undupDepth + sum.undupDepth;
+        dup.nDuplicatedMolecules = value.nDuplicatedMolecules + sum.nDuplicatedMolecules;
        dup.nUniqueMolecules = value.nUniqueMolecules + sum.nUniqueMolecules;
        return dup;
    }
@ -93,9 +114,10 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
    public void onTraversalDone(DuplicateCount result) {
        out.println("[REDUCE RESULT] Traversal result is: ");
        out.println("traversal iterations = " + result.count);
-        out.println("average depth = " + (double)result.depth / (double)result.count);
+        out.printf("average depth = %.2f%n", (double)result.depth / (double)result.count);
-        out.println("duplicates seen = " + result.depth);
+        out.println("unique molecules seen = " + result.nUniqueMolecules);
-        out.println("unique read count = " + result.undupDepth);
+        out.println("duplicated molecules seen = " + result.nDuplicatedMolecules);
-        out.println("average unique read depth = " + (double)result.undupDepth / (double)result.count);
+        out.printf("percent duplicated = %.2f%%%n", result.nDuplicatedMolecules / (double)result.nUniqueMolecules * 100);
        out.printf("average unique read depth = %.2f%n", (double)result.nUniqueMolecules / (double)result.count);
    }
 }
--- a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java
+++ b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java
@ -39,6 +39,13 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
        this(loc, readsOffsets2Pileup(reads, offsets));
    }
    /**
     * Create a new version of a read backed pileup at loc without any aligned reads
     *
     */
     public ReadBackedPileup(GenomeLoc loc ) {
        this(loc, new ArrayList<PileupElement>(0));
    }
    /**
     * Create a new version of a read backed pileup at loc, using the reads and their corresponding
--- a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java
@ -12,6 +12,7 @@ import org.junit.Test;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 /**
@ -37,49 +38,48 @@ public class TraverseDuplicatesTest extends BaseTest {
    public void testAllDupplicatesNoPairs() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
            read.setDuplicateReadFlag(true);
            list.add(read);
        }
-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+        Set<List<SAMRecord>> myPairings = obj.uniqueReadSets(list);
-        Assert.assertEquals(0, myPairing.first.size());   // unique
+        Assert.assertEquals(1, myPairings.size());
-        Assert.assertEquals(10, myPairing.second.size()); // dup's
+        Assert.assertEquals(10, myPairings.iterator().next().size()); // dup's
    }
    @Test
    public void testNoDupplicatesNoPairs() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
            read.setDuplicateReadFlag(false);
            list.add(read);
        }
-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+
-        Assert.assertEquals(10, myPairing.first.size()); // unique
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
-        Assert.assertEquals(0, myPairing.second.size()); // dup's
+        Assert.assertEquals(10, myPairing.size()); // unique
    }
    @Test
    public void testFiftyFiftyNoPairs() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 5; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
            read.setDuplicateReadFlag(true);
            list.add(read);
        }
        for (int x = 10; x < 15; x++)
            list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));
-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
-        Assert.assertEquals(5, myPairing.first.size());  // unique
+        Assert.assertEquals(6, myPairing.size());  // unique
        Assert.assertEquals(5, myPairing.second.size()); // dup's
    }
    @Test
    public void testAllDupplicatesAllPairs() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
            read.setDuplicateReadFlag(true);
            read.setMateAlignmentStart(100);
            read.setMateReferenceIndex(0);
@ -87,16 +87,15 @@ public class TraverseDuplicatesTest extends BaseTest {
            list.add(read);
        }
-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
-        Assert.assertEquals(0, myPairing.first.size());  // unique
+        Assert.assertEquals(1, myPairing.size());  // unique
        Assert.assertEquals(10, myPairing.second.size()); // dup's
    }
    @Test
    public void testNoDupplicatesAllPairs() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
            if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
            read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
            read.setMateReferenceIndex(0);
@ -104,16 +103,15 @@ public class TraverseDuplicatesTest extends BaseTest {
            list.add(read);
        }
-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
-        Assert.assertEquals(0, myPairing.first.size());  // unique
+        Assert.assertEquals(1, myPairing.size());  // unique
        Assert.assertEquals(10, myPairing.second.size()); // dup's
    }
    @Test
    public void testAllDupplicatesAllPairsDifferentPairedEnd() {
        List<SAMRecord> list = new ArrayList<SAMRecord>();
        for (int x = 0; x < 10; x++) {
-            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
            if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
            read.setMateAlignmentStart(100 + x);
            read.setMateReferenceIndex(0);
@ -121,8 +119,7 @@ public class TraverseDuplicatesTest extends BaseTest {
            list.add(read);
        }
-        Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+        Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
-        Assert.assertEquals(9, myPairing.first.size());  // unique
+        Assert.assertEquals(10, myPairing.size());  // unique
        Assert.assertEquals(1, myPairing.second.size()); // dup's
    }
 }
--- a/python/snpSelector.py
+++ b/python/snpSelector.py
@ -249,7 +249,7 @@ def calculateBinsForValues(values, field, minValue, maxValue, partitions):
            bins[-1][1] = bin[0]
            bins[-1][2] += curSize
-    #print 'Returning ', bins
+    print 'Returning ', bins
    #sys.exit(1)
    return bins