a fix for traverse duplicates bug: GSA-202. Also removed some debugging output from FastaAltRef walker

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1912 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-10-26 20:18:55 +00:00
parent a1e8a532ad
commit 3fb3773098
3 changed files with 153 additions and 21 deletions

View File

@ -57,10 +57,10 @@ import java.util.List;
/**
* @author Mark DePristo
* @version 0.1
* <p/>
* Class TraverseDuplicates
* <p/>
* This class handles traversing lists of duplicate reads in the new shardable style
* <p/>
* Class TraverseDuplicates
* <p/>
* This class handles traversing lists of duplicate reads in the new shardable style
*/
public class TraverseDuplicates extends TraversalEngine {
/** our log, which we want to capture anything from this class */
@ -90,10 +90,10 @@ public class TraverseDuplicates extends TraversalEngine {
return l;
}
private Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
List<SAMRecord> dups = new ArrayList<SAMRecord>();
// find the first duplicate
SAMRecord key = null;
for (SAMRecord read : reads) {
@ -109,22 +109,27 @@ public class TraverseDuplicates extends TraversalEngine {
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
if (key != null) {
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null :
GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
for (SAMRecord read : reads) {
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null :
GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
if (DEBUG)
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
// read and key start at the same place, and either the this read and the key
// share a mate location or the read is flagged as a duplicate
if (readLoc.compareTo(keyLoc) == 0 &&
(readMateLoc.compareTo(keyMateLoc) == 0) ||
if (readLoc.compareTo(keyLoc) == 0 ||
read.getDuplicateReadFlag()) {
// we are at the same position as the dup and have the same mat pos, it's a dup
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
dups.add(read);
if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) ||
(readMateLoc == null && keyMateLoc == null)) {
// we are at the same position as the dup and have the same mat pos, it's a dup
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
dups.add(read);
} else {
uniques.add(read);
}
} else {
uniques.add(read);
}
@ -139,11 +144,11 @@ public class TraverseDuplicates extends TraversalEngine {
/**
* Traverse by reads, given the data and the walker
*
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @param dupWalker our duplicates walker
* @param readIter our iterator
* @param readIter our iterator
*
* @return the reduce type, T, the final product of all the reduce calls
*/
@ -171,10 +176,10 @@ public class TraverseDuplicates extends TraversalEngine {
List<SAMRecord> duplicateReads = split.getSecond();
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
if (reads.size() != uniqueReads.size() + duplicateReads.size())
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
// Jump forward in the reference to this locus location
AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));

View File

@ -49,7 +49,6 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
continue;
// if we have multiple variants at a locus, just take the first damn one we see for now
Variation variant = (Variation) rod;
if (variant.getAlleleList().size() != 2) System.err.println("Not two " + Utils.join("-",variant.getAlleleList()));
if (!rod.getName().startsWith("snpmask") && variant.isDeletion()) {
deletionBasesRemaining = variant.getAlleleList().get(0).length();
basesSeen++;

View File

@ -0,0 +1,128 @@
package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
/**
* @author aaron
* <p/>
* Class TraverseDuplicatesTest
* <p/>
* test the meat of the traverse dupplicates.
*/
public class TraverseDuplicatesTest extends BaseTest {
private TraverseDuplicates obj = new TraverseDuplicates();
private SAMFileHeader header;
@Before
public void doBefore() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
}
@Test
public void testAllDupplicatesNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
read.setDuplicateReadFlag(true);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(0, myPairing.first.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
}
@Test
public void testNoDupplicatesNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
read.setDuplicateReadFlag(false);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(10, myPairing.first.size()); // unique
Assert.assertEquals(0, myPairing.second.size()); // dup's
}
@Test
public void testFiftyFiftyNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 5; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
read.setDuplicateReadFlag(true);
list.add(read);
}
for (int x = 10; x < 15; x++)
list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(5, myPairing.first.size()); // unique
Assert.assertEquals(5, myPairing.second.size()); // dup's
}
@Test
public void testAllDupplicatesAllPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
read.setDuplicateReadFlag(true);
read.setMateAlignmentStart(100);
read.setMateReferenceIndex(0);
read.setReadPairedFlag(true);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(0, myPairing.first.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
}
@Test
public void testNoDupplicatesAllPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
read.setMateReferenceIndex(0);
read.setReadPairedFlag(true);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(0, myPairing.first.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
}
@Test
public void testAllDupplicatesAllPairsDifferentPairedEnd() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
read.setMateAlignmentStart(100 + x);
read.setMateReferenceIndex(0);
read.setReadPairedFlag(true);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(9, myPairing.first.size()); // unique
Assert.assertEquals(1, myPairing.second.size()); // dup's
}
}