a fix for traverse duplicates bug: GSA-202. Also removed some debugging output from FastaAltRef walker

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1912 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-10-26 20:18:55 +00:00
parent a1e8a532ad
commit 3fb3773098
3 changed files with 153 additions and 21 deletions

View File

@ -57,10 +57,10 @@ import java.util.List;
/**
* @author Mark DePristo
* @version 0.1
* <p/>
* Class TraverseDuplicates
* <p/>
* This class handles traversing lists of duplicate reads in the new shardable style
* <p/>
* Class TraverseDuplicates
* <p/>
* This class handles traversing lists of duplicate reads in the new shardable style
*/
public class TraverseDuplicates extends TraversalEngine {
/** our log, which we want to capture anything from this class */
@ -90,10 +90,10 @@ public class TraverseDuplicates extends TraversalEngine {
return l;
}
private Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
List<SAMRecord> dups = new ArrayList<SAMRecord>();
// find the first duplicate
SAMRecord key = null;
for (SAMRecord read : reads) {
@ -109,22 +109,27 @@ public class TraverseDuplicates extends TraversalEngine {
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
if (key != null) {
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null :
GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
for (SAMRecord read : reads) {
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null :
GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
if (DEBUG)
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
// read and key start at the same place, and either the this read and the key
// share a mate location or the read is flagged as a duplicate
if (readLoc.compareTo(keyLoc) == 0 &&
(readMateLoc.compareTo(keyMateLoc) == 0) ||
if (readLoc.compareTo(keyLoc) == 0 ||
read.getDuplicateReadFlag()) {
// we are at the same position as the dup and have the same mat pos, it's a dup
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
dups.add(read);
if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) ||
(readMateLoc == null && keyMateLoc == null)) {
// we are at the same position as the dup and have the same mat pos, it's a dup
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
dups.add(read);
} else {
uniques.add(read);
}
} else {
uniques.add(read);
}
@ -139,11 +144,11 @@ public class TraverseDuplicates extends TraversalEngine {
/**
* Traverse by reads, given the data and the walker
*
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @param dupWalker our duplicates walker
* @param readIter our iterator
* @param readIter our iterator
*
* @return the reduce type, T, the final product of all the reduce calls
*/
@ -171,10 +176,10 @@ public class TraverseDuplicates extends TraversalEngine {
List<SAMRecord> duplicateReads = split.getSecond();
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
if (reads.size() != uniqueReads.size() + duplicateReads.size())
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
// Jump forward in the reference to this locus location
AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));

View File

@ -49,7 +49,6 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
continue;
// if we have multiple variants at a locus, just take the first damn one we see for now
Variation variant = (Variation) rod;
if (variant.getAlleleList().size() != 2) System.err.println("Not two " + Utils.join("-",variant.getAlleleList()));
if (!rod.getName().startsWith("snpmask") && variant.isDeletion()) {
deletionBasesRemaining = variant.getAlleleList().get(0).length();
basesSeen++;

View File

@ -0,0 +1,128 @@
package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
/**
* @author aaron
* <p/>
* Class TraverseDuplicatesTest
* <p/>
* test the meat of the traverse dupplicates.
*/
public class TraverseDuplicatesTest extends BaseTest {
private TraverseDuplicates obj = new TraverseDuplicates();
private SAMFileHeader header;
@Before
public void doBefore() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
}
@Test
public void testAllDupplicatesNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
read.setDuplicateReadFlag(true);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(0, myPairing.first.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
}
@Test
public void testNoDupplicatesNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
read.setDuplicateReadFlag(false);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(10, myPairing.first.size()); // unique
Assert.assertEquals(0, myPairing.second.size()); // dup's
}
@Test
public void testFiftyFiftyNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 5; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
read.setDuplicateReadFlag(true);
list.add(read);
}
for (int x = 10; x < 15; x++)
list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(5, myPairing.first.size()); // unique
Assert.assertEquals(5, myPairing.second.size()); // dup's
}
@Test
public void testAllDupplicatesAllPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
read.setDuplicateReadFlag(true);
read.setMateAlignmentStart(100);
read.setMateReferenceIndex(0);
read.setReadPairedFlag(true);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(0, myPairing.first.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
}
@Test
public void testNoDupplicatesAllPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
read.setMateReferenceIndex(0);
read.setReadPairedFlag(true);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(0, myPairing.first.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
}
@Test
public void testAllDupplicatesAllPairsDifferentPairedEnd() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
read.setMateAlignmentStart(100 + x);
read.setMateReferenceIndex(0);
read.setReadPairedFlag(true);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(9, myPairing.first.size()); // unique
Assert.assertEquals(1, myPairing.second.size()); // dup's
}
}