a fix for traverse dupplicates bug: GSA-202. Also removed some debugging output from FastaAltRef walker
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1912 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a1e8a532ad
commit
3fb3773098
|
|
@ -57,10 +57,10 @@ import java.util.List;
|
|||
/**
|
||||
* @author Mark DePristo
|
||||
* @version 0.1
|
||||
* <p/>
|
||||
* Class TraverseDuplicates
|
||||
* <p/>
|
||||
* This class handles traversing lists of duplicate reads in the new shardable style
|
||||
* <p/>
|
||||
* Class TraverseDuplicates
|
||||
* <p/>
|
||||
* This class handles traversing lists of duplicate reads in the new shardable style
|
||||
*/
|
||||
public class TraverseDuplicates extends TraversalEngine {
|
||||
/** our log, which we want to capture anything from this class */
|
||||
|
|
@ -90,10 +90,10 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
return l;
|
||||
}
|
||||
|
||||
private Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
|
||||
protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
|
||||
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
|
||||
List<SAMRecord> dups = new ArrayList<SAMRecord>();
|
||||
|
||||
|
||||
// find the first duplicate
|
||||
SAMRecord key = null;
|
||||
for (SAMRecord read : reads) {
|
||||
|
|
@ -109,22 +109,27 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
|
||||
if (key != null) {
|
||||
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
|
||||
final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
|
||||
|
||||
final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null :
|
||||
GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
|
||||
for (SAMRecord read : reads) {
|
||||
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
|
||||
final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
|
||||
final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null :
|
||||
GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
|
||||
if (DEBUG)
|
||||
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
|
||||
|
||||
// read and key start at the same place, and either the this read and the key
|
||||
// share a mate location or the read is flagged as a duplicate
|
||||
if (readLoc.compareTo(keyLoc) == 0 &&
|
||||
(readMateLoc.compareTo(keyMateLoc) == 0) ||
|
||||
if (readLoc.compareTo(keyLoc) == 0 ||
|
||||
read.getDuplicateReadFlag()) {
|
||||
// we are at the same position as the dup and have the same mat pos, it's a dup
|
||||
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
|
||||
dups.add(read);
|
||||
if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) ||
|
||||
(readMateLoc == null && keyMateLoc == null)) {
|
||||
// we are at the same position as the dup and have the same mat pos, it's a dup
|
||||
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
|
||||
dups.add(read);
|
||||
} else {
|
||||
uniques.add(read);
|
||||
}
|
||||
} else {
|
||||
uniques.add(read);
|
||||
}
|
||||
|
|
@ -139,11 +144,11 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
*
|
||||
* @param sum of type T, the return from the walker
|
||||
* @param <M> the generic type
|
||||
* @param <T> the return type of the reduce function
|
||||
* @param sum of type T, the return from the walker
|
||||
* @param <M> the generic type
|
||||
* @param <T> the return type of the reduce function
|
||||
* @param dupWalker our duplicates walker
|
||||
* @param readIter our iterator
|
||||
* @param readIter our iterator
|
||||
*
|
||||
* @return the reduce type, T, the final product of all the reduce calls
|
||||
*/
|
||||
|
|
@ -171,10 +176,10 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
List<SAMRecord> duplicateReads = split.getSecond();
|
||||
|
||||
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
|
||||
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
|
||||
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
|
||||
if (reads.size() != uniqueReads.size() + duplicateReads.size())
|
||||
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
|
||||
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
|
||||
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
|
||||
|
||||
// Jump forward in the reference to this locus location
|
||||
AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));
|
||||
|
|
|
|||
|
|
@ -49,7 +49,6 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
|
|||
continue;
|
||||
// if we have multiple variants at a locus, just take the first damn one we see for now
|
||||
Variation variant = (Variation) rod;
|
||||
if (variant.getAlleleList().size() != 2) System.err.println("Not two " + Utils.join("-",variant.getAlleleList()));
|
||||
if (!rod.getName().startsWith("snpmask") && variant.isDeletion()) {
|
||||
deletionBasesRemaining = variant.getAlleleList().get(0).length();
|
||||
basesSeen++;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,128 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class TraverseDuplicatesTest
|
||||
* <p/>
|
||||
* test the meat of the traverse dupplicates.
|
||||
*/
|
||||
public class TraverseDuplicatesTest extends BaseTest {
|
||||
|
||||
private TraverseDuplicates obj = new TraverseDuplicates();
|
||||
private SAMFileHeader header;
|
||||
|
||||
|
||||
@Before
|
||||
public void doBefore() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||
GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllDupplicatesNoPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
read.setDuplicateReadFlag(true);
|
||||
list.add(read);
|
||||
}
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(0, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(10, myPairing.second.size()); // dup's
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoDupplicatesNoPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
read.setDuplicateReadFlag(false);
|
||||
list.add(read);
|
||||
}
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(10, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(0, myPairing.second.size()); // dup's
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFiftyFiftyNoPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 5; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
read.setDuplicateReadFlag(true);
|
||||
list.add(read);
|
||||
}
|
||||
for (int x = 10; x < 15; x++)
|
||||
list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));
|
||||
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(5, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(5, myPairing.second.size()); // dup's
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllDupplicatesAllPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
read.setDuplicateReadFlag(true);
|
||||
read.setMateAlignmentStart(100);
|
||||
read.setMateReferenceIndex(0);
|
||||
read.setReadPairedFlag(true);
|
||||
list.add(read);
|
||||
}
|
||||
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(0, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(10, myPairing.second.size()); // dup's
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoDupplicatesAllPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
|
||||
read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
|
||||
read.setMateReferenceIndex(0);
|
||||
read.setReadPairedFlag(true);
|
||||
list.add(read);
|
||||
}
|
||||
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(0, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(10, myPairing.second.size()); // dup's
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllDupplicatesAllPairsDifferentPairedEnd() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
|
||||
read.setMateAlignmentStart(100 + x);
|
||||
read.setMateReferenceIndex(0);
|
||||
read.setReadPairedFlag(true);
|
||||
list.add(read);
|
||||
}
|
||||
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(9, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(1, myPairing.second.size()); // dup's
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue