From 3fb377309817de58e335e459e9755cffc4886297 Mon Sep 17 00:00:00 2001 From: aaron Date: Mon, 26 Oct 2009 20:18:55 +0000 Subject: [PATCH] a fix for traverse dupplicates bug: GSA-202. Also removed some debugging output from FastaAltRef walker git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1912 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/traversals/TraverseDuplicates.java | 45 +++--- .../fasta/FastaAlternateReferenceWalker.java | 1 - .../traversals/TraverseDuplicatesTest.java | 128 ++++++++++++++++++ 3 files changed, 153 insertions(+), 21 deletions(-) create mode 100644 java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 96585a142..d17edea30 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -57,10 +57,10 @@ import java.util.List; /** * @author Mark DePristo * @version 0.1 - *

- * Class TraverseDuplicates - *

- * This class handles traversing lists of duplicate reads in the new shardable style + *

+ * Class TraverseDuplicates + *

+ * This class handles traversing lists of duplicate reads in the new shardable style */ public class TraverseDuplicates extends TraversalEngine { /** our log, which we want to capture anything from this class */ @@ -90,10 +90,10 @@ public class TraverseDuplicates extends TraversalEngine { return l; } - private Pair, List> splitDuplicates(List reads) { + protected Pair, List> splitDuplicates(List reads) { List uniques = new ArrayList(); List dups = new ArrayList(); - + // find the first duplicate SAMRecord key = null; for (SAMRecord read : reads) { @@ -109,22 +109,27 @@ public class TraverseDuplicates extends TraversalEngine { // if it's a dup, add it to the dups list, otherwise add it to the uniques list if (key != null) { final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key); - final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); - + final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null : + GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); for (SAMRecord read : reads) { final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read); - final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); + final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null : + GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); if (DEBUG) logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc)); // read and key start at the same place, and either the this read and the key // share a mate location or the read is flagged as a duplicate - if (readLoc.compareTo(keyLoc) == 0 && - (readMateLoc.compareTo(keyMateLoc) == 0) || + if (readLoc.compareTo(keyLoc) == 0 || read.getDuplicateReadFlag()) { - // we are at the same position as the dup and have the same mat pos, it's a dup - if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read)); - dups.add(read); + if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) || + (readMateLoc == null && keyMateLoc == null)) { + // we are at the same position as the dup and have the same mat pos, it's a dup + if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read)); + dups.add(read); + } else { + uniques.add(read); + } } else { uniques.add(read); } @@ -139,11 +144,11 @@ public class TraverseDuplicates extends TraversalEngine { /** * Traverse by reads, given the data and the walker * - * @param sum of type T, the return from the walker - * @param the generic type - * @param the return type of the reduce function + * @param sum of type T, the return from the walker + * @param the generic type + * @param the return type of the reduce function * @param dupWalker our duplicates walker - * @param readIter our iterator + * @param readIter our iterator * * @return the reduce type, T, the final product of all the reduce calls */ @@ -171,10 +176,10 @@ public class TraverseDuplicates extends TraversalEngine { List duplicateReads = split.getSecond(); logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads", - site, reads.size(), uniqueReads.size(), duplicateReads.size())); + site, reads.size(), uniqueReads.size(), duplicateReads.size())); if (reads.size() != uniqueReads.size() + duplicateReads.size()) throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match", - reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size())); + reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size())); // Jump forward in the reference to this locus location AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0)); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index e5ecab166..5b8eefa8e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -49,7 +49,6 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker { continue; // if we have multiple variants at a locus, just take the first damn one we see for now Variation variant = (Variation) rod; - if (variant.getAlleleList().size() != 2) System.err.println("Not two " + Utils.join("-",variant.getAlleleList())); if (!rod.getName().startsWith("snpmask") && variant.isDeletion()) { deletionBasesRemaining = variant.getAlleleList().get(0).length(); basesSeen++; diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java new file mode 100644 index 000000000..027002410 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java @@ -0,0 +1,128 @@ +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + + +/** + * @author aaron + *

+ * Class TraverseDuplicatesTest + *

+ * test the meat of the traverse dupplicates. + */ +public class TraverseDuplicatesTest extends BaseTest { + + private TraverseDuplicates obj = new TraverseDuplicates(); + private SAMFileHeader header; + + + @Before + public void doBefore() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary()); + } + + @Test + public void testAllDupplicatesNoPairs() { + List list = new ArrayList(); + for (int x = 0; x < 10; x++) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + read.setDuplicateReadFlag(true); + list.add(read); + } + Pair, List> myPairing = obj.splitDuplicates(list); + Assert.assertEquals(0, myPairing.first.size()); // unique + Assert.assertEquals(10, myPairing.second.size()); // dup's + } + + @Test + public void testNoDupplicatesNoPairs() { + List list = new ArrayList(); + for (int x = 0; x < 10; x++) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + read.setDuplicateReadFlag(false); + list.add(read); + } + Pair, List> myPairing = obj.splitDuplicates(list); + Assert.assertEquals(10, myPairing.first.size()); // unique + Assert.assertEquals(0, myPairing.second.size()); // dup's + } + + @Test + public void testFiftyFiftyNoPairs() { + List list = new ArrayList(); + for (int x = 0; x < 5; x++) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + read.setDuplicateReadFlag(true); + list.add(read); + } + for (int x = 10; x < 15; x++) + list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100)); + + Pair, List> myPairing = obj.splitDuplicates(list); + Assert.assertEquals(5, myPairing.first.size()); // unique + Assert.assertEquals(5, myPairing.second.size()); // dup's + } + + @Test + public void testAllDupplicatesAllPairs() { + List list = new ArrayList(); + for (int x = 0; x < 10; x++) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + read.setDuplicateReadFlag(true); + read.setMateAlignmentStart(100); + read.setMateReferenceIndex(0); + read.setReadPairedFlag(true); + list.add(read); + } + + Pair, List> myPairing = obj.splitDuplicates(list); + Assert.assertEquals(0, myPairing.first.size()); // unique + Assert.assertEquals(10, myPairing.second.size()); // dup's + } + + @Test + public void testNoDupplicatesAllPairs() { + List list = new ArrayList(); + for (int x = 0; x < 10; x++) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line) + read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's + read.setMateReferenceIndex(0); + read.setReadPairedFlag(true); + list.add(read); + } + + Pair, List> myPairing = obj.splitDuplicates(list); + Assert.assertEquals(0, myPairing.first.size()); // unique + Assert.assertEquals(10, myPairing.second.size()); // dup's + } + + @Test + public void testAllDupplicatesAllPairsDifferentPairedEnd() { + List list = new ArrayList(); + for (int x = 0; x < 10; x++) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + if (x == 0) read.setDuplicateReadFlag(true); // one is a dup + read.setMateAlignmentStart(100 + x); + read.setMateReferenceIndex(0); + read.setReadPairedFlag(true); + list.add(read); + } + + Pair, List> myPairing = obj.splitDuplicates(list); + Assert.assertEquals(9, myPairing.first.size()); // unique + Assert.assertEquals(1, myPairing.second.size()); // dup's + } +}