diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
index 96585a142..d17edea30 100755
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
@@ -57,10 +57,10 @@ import java.util.List;
/**
* @author Mark DePristo
* @version 0.1
- *
- * Class TraverseDuplicates
- *
- * This class handles traversing lists of duplicate reads in the new shardable style
+ *
+ * Class TraverseDuplicates
+ *
+ * This class handles traversing lists of duplicate reads in the new shardable style
*/
public class TraverseDuplicates extends TraversalEngine {
/** our log, which we want to capture anything from this class */
@@ -90,10 +90,10 @@ public class TraverseDuplicates extends TraversalEngine {
return l;
}
- private Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
+ protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
List<SAMRecord> dups = new ArrayList<SAMRecord>();
-
+
// find the first duplicate
SAMRecord key = null;
for (SAMRecord read : reads) {
@@ -109,22 +109,27 @@ public class TraverseDuplicates extends TraversalEngine {
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
if (key != null) {
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
- final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
-
+ final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null :
+ GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
for (SAMRecord read : reads) {
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
- final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
+ final GenomeLoc readMateLoc = (!read.getReadPairedFlag()) ? null :
+ GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
if (DEBUG)
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
// read and key start at the same place, and either the this read and the key
// share a mate location or the read is flagged as a duplicate
- if (readLoc.compareTo(keyLoc) == 0 &&
- (readMateLoc.compareTo(keyMateLoc) == 0) ||
+ if (readLoc.compareTo(keyLoc) == 0 ||
read.getDuplicateReadFlag()) {
- // we are at the same position as the dup and have the same mat pos, it's a dup
- if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
- dups.add(read);
+ if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) ||
+ (readMateLoc == null && keyMateLoc == null)) {
+ // we are at the same position as the dup and have the same mate pos, it's a dup
+ if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
+ dups.add(read);
+ } else {
+ uniques.add(read);
+ }
} else {
uniques.add(read);
}
@@ -139,11 +144,11 @@ public class TraverseDuplicates extends TraversalEngine {
/**
* Traverse by reads, given the data and the walker
*
- * @param sum of type T, the return from the walker
- * @param <M> the generic type
- * @param <T> the return type of the reduce function
+ * @param sum of type T, the return from the walker
+ * @param <M> the generic type
+ * @param <T> the return type of the reduce function
* @param dupWalker our duplicates walker
- * @param readIter our iterator
+ * @param readIter our iterator
*
* @return the reduce type, T, the final product of all the reduce calls
*/
@@ -171,10 +176,10 @@ public class TraverseDuplicates extends TraversalEngine {
List<SAMRecord> duplicateReads = split.getSecond();
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
- site, reads.size(), uniqueReads.size(), duplicateReads.size()));
+ site, reads.size(), uniqueReads.size(), duplicateReads.size()));
if (reads.size() != uniqueReads.size() + duplicateReads.size())
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
- reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
+ reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
// Jump forward in the reference to this locus location
AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
index e5ecab166..5b8eefa8e 100755
--- a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
@@ -49,7 +49,6 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
continue;
// if we have multiple variants at a locus, just take the first damn one we see for now
Variation variant = (Variation) rod;
- if (variant.getAlleleList().size() != 2) System.err.println("Not two " + Utils.join("-",variant.getAlleleList()));
if (!rod.getName().startsWith("snpmask") && variant.isDeletion()) {
deletionBasesRemaining = variant.getAlleleList().get(0).length();
basesSeen++;
diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java
new file mode 100644
index 000000000..027002410
--- /dev/null
+++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java
@@ -0,0 +1,128 @@
+package org.broadinstitute.sting.gatk.traversals;
+
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.Pair;
+import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+
+/**
+ * @author aaron
+ *
+ * Class TraverseDuplicatesTest
+ *
+ * test the meat of the traverse duplicates.
+ */
+public class TraverseDuplicatesTest extends BaseTest {
+
+ private TraverseDuplicates obj = new TraverseDuplicates();
+ private SAMFileHeader header;
+
+
+ @Before
+ public void doBefore() {
+ header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
+ GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
+ }
+
+ @Test
+ public void testAllDupplicatesNoPairs() {
+ List<SAMRecord> list = new ArrayList<SAMRecord>();
+ for (int x = 0; x < 10; x++) {
+ SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+ read.setDuplicateReadFlag(true);
+ list.add(read);
+ }
+ Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+ Assert.assertEquals(0, myPairing.first.size()); // unique
+ Assert.assertEquals(10, myPairing.second.size()); // dup's
+ }
+
+ @Test
+ public void testNoDupplicatesNoPairs() {
+ List<SAMRecord> list = new ArrayList<SAMRecord>();
+ for (int x = 0; x < 10; x++) {
+ SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+ read.setDuplicateReadFlag(false);
+ list.add(read);
+ }
+ Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+ Assert.assertEquals(10, myPairing.first.size()); // unique
+ Assert.assertEquals(0, myPairing.second.size()); // dup's
+ }
+
+ @Test
+ public void testFiftyFiftyNoPairs() {
+ List<SAMRecord> list = new ArrayList<SAMRecord>();
+ for (int x = 0; x < 5; x++) {
+ SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+ read.setDuplicateReadFlag(true);
+ list.add(read);
+ }
+ for (int x = 10; x < 15; x++)
+ list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));
+
+ Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+ Assert.assertEquals(5, myPairing.first.size()); // unique
+ Assert.assertEquals(5, myPairing.second.size()); // dup's
+ }
+
+ @Test
+ public void testAllDupplicatesAllPairs() {
+ List<SAMRecord> list = new ArrayList<SAMRecord>();
+ for (int x = 0; x < 10; x++) {
+ SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+ read.setDuplicateReadFlag(true);
+ read.setMateAlignmentStart(100);
+ read.setMateReferenceIndex(0);
+ read.setReadPairedFlag(true);
+ list.add(read);
+ }
+
+ Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+ Assert.assertEquals(0, myPairing.first.size()); // unique
+ Assert.assertEquals(10, myPairing.second.size()); // dup's
+ }
+
+ @Test
+ public void testNoDupplicatesAllPairs() {
+ List<SAMRecord> list = new ArrayList<SAMRecord>();
+ for (int x = 0; x < 10; x++) {
+ SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+ if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
+ read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
+ read.setMateReferenceIndex(0);
+ read.setReadPairedFlag(true);
+ list.add(read);
+ }
+
+ Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+ Assert.assertEquals(0, myPairing.first.size()); // unique
+ Assert.assertEquals(10, myPairing.second.size()); // dup's
+ }
+
+ @Test
+ public void testAllDupplicatesAllPairsDifferentPairedEnd() {
+ List<SAMRecord> list = new ArrayList<SAMRecord>();
+ for (int x = 0; x < 10; x++) {
+ SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
+ if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
+ read.setMateAlignmentStart(100 + x);
+ read.setMateReferenceIndex(0);
+ read.setReadPairedFlag(true);
+ list.add(read);
+ }
+
+ Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
+ Assert.assertEquals(9, myPairing.first.size()); // unique
+ Assert.assertEquals(1, myPairing.second.size()); // dup's
+ }
+}