a fix for traverse dupplicates bug: GSA-202. Also removed some debugging output from FastaAltRef walker
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1912 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a1e8a532ad
commit
3fb3773098
|
|
@ -57,10 +57,10 @@ import java.util.List;
|
||||||
/**
|
/**
|
||||||
* @author Mark DePristo
|
* @author Mark DePristo
|
||||||
* @version 0.1
|
* @version 0.1
|
||||||
* <p/>
|
* <p/>
|
||||||
* Class TraverseDuplicates
|
* Class TraverseDuplicates
|
||||||
* <p/>
|
* <p/>
|
||||||
* This class handles traversing lists of duplicate reads in the new shardable style
|
* This class handles traversing lists of duplicate reads in the new shardable style
|
||||||
*/
|
*/
|
||||||
public class TraverseDuplicates extends TraversalEngine {
|
public class TraverseDuplicates extends TraversalEngine {
|
||||||
/** our log, which we want to capture anything from this class */
|
/** our log, which we want to capture anything from this class */
|
||||||
|
|
@ -90,10 +90,10 @@ public class TraverseDuplicates extends TraversalEngine {
|
||||||
return l;
|
return l;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
|
protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
|
||||||
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
|
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
|
||||||
List<SAMRecord> dups = new ArrayList<SAMRecord>();
|
List<SAMRecord> dups = new ArrayList<SAMRecord>();
|
||||||
|
|
||||||
// find the first duplicate
|
// find the first duplicate
|
||||||
SAMRecord key = null;
|
SAMRecord key = null;
|
||||||
for (SAMRecord read : reads) {
|
for (SAMRecord read : reads) {
|
||||||
|
|
@ -109,22 +109,27 @@ public class TraverseDuplicates extends TraversalEngine {
|
||||||
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
|
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
|
||||||
if (key != null) {
|
if (key != null) {
|
||||||
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
|
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
|
||||||
final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
|
final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null :
|
||||||
|
GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
|
||||||
for (SAMRecord read : reads) {
|
for (SAMRecord read : reads) {
|
||||||
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
|
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
|
||||||
final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
|
final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null :
|
||||||
|
GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
|
||||||
if (DEBUG)
|
if (DEBUG)
|
||||||
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
|
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
|
||||||
|
|
||||||
// read and key start at the same place, and either the this read and the key
|
// read and key start at the same place, and either the this read and the key
|
||||||
// share a mate location or the read is flagged as a duplicate
|
// share a mate location or the read is flagged as a duplicate
|
||||||
if (readLoc.compareTo(keyLoc) == 0 &&
|
if (readLoc.compareTo(keyLoc) == 0 ||
|
||||||
(readMateLoc.compareTo(keyMateLoc) == 0) ||
|
|
||||||
read.getDuplicateReadFlag()) {
|
read.getDuplicateReadFlag()) {
|
||||||
// we are at the same position as the dup and have the same mat pos, it's a dup
|
if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) ||
|
||||||
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
|
(readMateLoc == null && keyMateLoc == null)) {
|
||||||
dups.add(read);
|
// we are at the same position as the dup and have the same mat pos, it's a dup
|
||||||
|
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
|
||||||
|
dups.add(read);
|
||||||
|
} else {
|
||||||
|
uniques.add(read);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
uniques.add(read);
|
uniques.add(read);
|
||||||
}
|
}
|
||||||
|
|
@ -139,11 +144,11 @@ public class TraverseDuplicates extends TraversalEngine {
|
||||||
/**
|
/**
|
||||||
* Traverse by reads, given the data and the walker
|
* Traverse by reads, given the data and the walker
|
||||||
*
|
*
|
||||||
* @param sum of type T, the return from the walker
|
* @param sum of type T, the return from the walker
|
||||||
* @param <M> the generic type
|
* @param <M> the generic type
|
||||||
* @param <T> the return type of the reduce function
|
* @param <T> the return type of the reduce function
|
||||||
* @param dupWalker our duplicates walker
|
* @param dupWalker our duplicates walker
|
||||||
* @param readIter our iterator
|
* @param readIter our iterator
|
||||||
*
|
*
|
||||||
* @return the reduce type, T, the final product of all the reduce calls
|
* @return the reduce type, T, the final product of all the reduce calls
|
||||||
*/
|
*/
|
||||||
|
|
@ -171,10 +176,10 @@ public class TraverseDuplicates extends TraversalEngine {
|
||||||
List<SAMRecord> duplicateReads = split.getSecond();
|
List<SAMRecord> duplicateReads = split.getSecond();
|
||||||
|
|
||||||
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
|
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
|
||||||
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
|
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
|
||||||
if (reads.size() != uniqueReads.size() + duplicateReads.size())
|
if (reads.size() != uniqueReads.size() + duplicateReads.size())
|
||||||
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
|
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
|
||||||
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
|
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
|
||||||
|
|
||||||
// Jump forward in the reference to this locus location
|
// Jump forward in the reference to this locus location
|
||||||
AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));
|
AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,6 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
|
||||||
continue;
|
continue;
|
||||||
// if we have multiple variants at a locus, just take the first damn one we see for now
|
// if we have multiple variants at a locus, just take the first damn one we see for now
|
||||||
Variation variant = (Variation) rod;
|
Variation variant = (Variation) rod;
|
||||||
if (variant.getAlleleList().size() != 2) System.err.println("Not two " + Utils.join("-",variant.getAlleleList()));
|
|
||||||
if (!rod.getName().startsWith("snpmask") && variant.isDeletion()) {
|
if (!rod.getName().startsWith("snpmask") && variant.isDeletion()) {
|
||||||
deletionBasesRemaining = variant.getAlleleList().get(0).length();
|
deletionBasesRemaining = variant.getAlleleList().get(0).length();
|
||||||
basesSeen++;
|
basesSeen++;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,128 @@
|
||||||
|
package org.broadinstitute.sting.gatk.traversals;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class TraverseDuplicatesTest
|
||||||
|
* <p/>
|
||||||
|
* test the meat of the traverse dupplicates.
|
||||||
|
*/
|
||||||
|
public class TraverseDuplicatesTest extends BaseTest {
|
||||||
|
|
||||||
|
private TraverseDuplicates obj = new TraverseDuplicates();
|
||||||
|
private SAMFileHeader header;
|
||||||
|
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void doBefore() {
|
||||||
|
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||||
|
GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAllDupplicatesNoPairs() {
|
||||||
|
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||||
|
for (int x = 0; x < 10; x++) {
|
||||||
|
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||||
|
read.setDuplicateReadFlag(true);
|
||||||
|
list.add(read);
|
||||||
|
}
|
||||||
|
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||||
|
Assert.assertEquals(0, myPairing.first.size()); // unique
|
||||||
|
Assert.assertEquals(10, myPairing.second.size()); // dup's
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNoDupplicatesNoPairs() {
|
||||||
|
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||||
|
for (int x = 0; x < 10; x++) {
|
||||||
|
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||||
|
read.setDuplicateReadFlag(false);
|
||||||
|
list.add(read);
|
||||||
|
}
|
||||||
|
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||||
|
Assert.assertEquals(10, myPairing.first.size()); // unique
|
||||||
|
Assert.assertEquals(0, myPairing.second.size()); // dup's
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFiftyFiftyNoPairs() {
|
||||||
|
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||||
|
for (int x = 0; x < 5; x++) {
|
||||||
|
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||||
|
read.setDuplicateReadFlag(true);
|
||||||
|
list.add(read);
|
||||||
|
}
|
||||||
|
for (int x = 10; x < 15; x++)
|
||||||
|
list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));
|
||||||
|
|
||||||
|
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||||
|
Assert.assertEquals(5, myPairing.first.size()); // unique
|
||||||
|
Assert.assertEquals(5, myPairing.second.size()); // dup's
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAllDupplicatesAllPairs() {
|
||||||
|
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||||
|
for (int x = 0; x < 10; x++) {
|
||||||
|
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||||
|
read.setDuplicateReadFlag(true);
|
||||||
|
read.setMateAlignmentStart(100);
|
||||||
|
read.setMateReferenceIndex(0);
|
||||||
|
read.setReadPairedFlag(true);
|
||||||
|
list.add(read);
|
||||||
|
}
|
||||||
|
|
||||||
|
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||||
|
Assert.assertEquals(0, myPairing.first.size()); // unique
|
||||||
|
Assert.assertEquals(10, myPairing.second.size()); // dup's
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNoDupplicatesAllPairs() {
|
||||||
|
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||||
|
for (int x = 0; x < 10; x++) {
|
||||||
|
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||||
|
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
|
||||||
|
read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
|
||||||
|
read.setMateReferenceIndex(0);
|
||||||
|
read.setReadPairedFlag(true);
|
||||||
|
list.add(read);
|
||||||
|
}
|
||||||
|
|
||||||
|
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||||
|
Assert.assertEquals(0, myPairing.first.size()); // unique
|
||||||
|
Assert.assertEquals(10, myPairing.second.size()); // dup's
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAllDupplicatesAllPairsDifferentPairedEnd() {
|
||||||
|
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||||
|
for (int x = 0; x < 10; x++) {
|
||||||
|
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||||
|
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
|
||||||
|
read.setMateAlignmentStart(100 + x);
|
||||||
|
read.setMateReferenceIndex(0);
|
||||||
|
read.setReadPairedFlag(true);
|
||||||
|
list.add(read);
|
||||||
|
}
|
||||||
|
|
||||||
|
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||||
|
Assert.assertEquals(9, myPairing.first.size()); // unique
|
||||||
|
Assert.assertEquals(1, myPairing.second.size()); // dup's
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue