Completely rewrote the duplicate traversal, with fewer bugs, and added integration tests for the count-duplicates walker, validated on a TCGA hybrid capture lane.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2458 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-12-28 23:56:49 +00:00
parent 4617052b3c
commit fcc80e8632
8 changed files with 227 additions and 240 deletions

View File

@ -32,6 +32,7 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ManagingReferenceOrderedView;
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
@ -40,11 +41,10 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.*;
/**
* @author Mark DePristo
@ -85,128 +85,58 @@ public class TraverseDuplicates extends TraversalEngine {
return l;
}
protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
List<SAMRecord> dups = new ArrayList<SAMRecord>();
// find the first duplicate
SAMRecord key = null;
for (SAMRecord read : reads) {
if (read.getDuplicateReadFlag()) {
// this is our key
key = read;
if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName()));
break;
protected Set<List<SAMRecord>> uniqueReadSets(List<SAMRecord> reads) {
Set<List<SAMRecord>> readSets = new HashSet<List<SAMRecord>>();
for ( SAMRecord read : reads ) {
List<SAMRecord> readSet = findDuplicateReads(read, readSets);
if ( readSet == null ) {
readSets.add(new ArrayList<SAMRecord>(Arrays.asList(read))); // copy so I can add to the list
} else {
readSet.add(read);
}
}
// At this point, there are two possibilities, we have found at least one dup or not
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
if (key != null) {
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null :
GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
for (SAMRecord read : reads) {
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null :
GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
if (DEBUG)
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
return readSets;
}
protected List<SAMRecord> findDuplicateReads(SAMRecord read, Set<List<SAMRecord>> readSets ) {
if ( read.getReadPairedFlag() ) {
// paired
final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
for (List<SAMRecord> reads : readSets) {
SAMRecord key = reads.get(0);
//if (DEBUG)
// logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
// read and key start at the same place, and either the this read and the key
// share a mate location or the read is flagged as a duplicate
if (readLoc.compareTo(keyLoc) == 0 ||
read.getDuplicateReadFlag()) {
if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) ||
(readMateLoc == null && keyMateLoc == null)) {
if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) {
// at least one has to be marked as a duplicate
final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
if ( readMateLoc.compareTo(keyMateLoc) == 0 ) {
// we are at the same position as the dup and have the same mate position, so it's a dup
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
dups.add(read);
} else {
uniques.add(read);
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc));
return reads;
}
} else {
uniques.add(read);
}
}
} else {
uniques = reads;
}
return new Pair<List<SAMRecord>, List<SAMRecord>>(uniques, dups);
}
/**
* Traverse by reads, given the data and the walker
*
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @param dupWalker our duplicates walker
* @param readIter our iterator
*
* @return the reduce type, T, the final product of all the reduce calls
*/
private <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
Iterator<SAMRecord> readIter,
T sum) {
/**
* while we still have more reads:
* ok, here's the idea. We get all the reads that start at the same position in the genome
* We then split the list of reads into sublists of reads:
* -> those with the same mate pair position, for paired reads
* -> those flagged as unpaired and duplicated but having the same start and end and
*/
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
for (SAMRecord read : iter) {
// get the genome loc from the read
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
List<SAMRecord> reads = readsAtLoc(read, iter);
Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
List<SAMRecord> uniqueReads = split.getFirst();
List<SAMRecord> duplicateReads = split.getSecond();
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
if (reads.size() != uniqueReads.size() + duplicateReads.size())
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
// Jump forward in the reference to this locus location
AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));
// update the number of duplicate sets we've seen
TraversalStatistics.nRecords++;
// we still have to fix the locus context provider to take care of this problem with > 1 length contexts
// AlignmentContext locus = locusProvider.getLocusContext(site);
byte[] refBases = new byte[0];
if (dupWalker.mapUniqueReadsTooP()) {
// Send each unique read to the map function
for (SAMRecord unique : uniqueReads) {
List<SAMRecord> l = Arrays.asList(unique);
sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
for (List<SAMRecord> reads : readSets) {
SAMRecord key = reads.get(0);
boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength();
//System.out.printf("%s %s %b %b %d %d %d %d => %b%n",
// read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(),
// read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v);
if ( v ) {
//System.out.printf("Returning reads...%n");
return reads;
}
}
if (duplicateReads.size() > 0)
sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
printProgress(DUPS_STRING, site);
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
break;
}
}
return sum;
return null;
}
/**
@ -232,7 +162,7 @@ public class TraverseDuplicates extends TraversalEngine {
if (result) {
TraversalStatistics.nSkippedReads++;
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
//System.out.printf(" [filter] %s => %b", rec.getReadName(), result);
} else {
TraversalStatistics.nReads++;
}
@ -240,27 +170,12 @@ public class TraverseDuplicates extends TraversalEngine {
}
}
/**
 * Runs one filter/map/reduce cycle of the duplicate walker for a single site.
 *
 * @param dupWalker      the duplicates walker supplying filter/map/reduce
 * @param uniqueReads    the non-duplicate reads at this site
 * @param duplicateReads the reads grouped as duplicates at this site
 * @param site           the genomic location being processed
 * @param refBases       reference bases for the site (caller currently passes an empty array)
 * @param locus          the alignment context covering the site
 * @param sum            the reduce accumulator carried between calls
 * @param <M>            the walker's map result type
 * @param <T>            the walker's reduce (accumulator) type
 * @return dupWalker.reduce(map(...), sum) when the walker's filter accepts the site,
 *         otherwise sum unchanged
 */
public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads,
GenomeLoc site,
byte[] refBases,
AlignmentContext locus,
T sum) {
// ask the walker whether this site should be processed at all
final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads);
if (keepMeP) {
M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads);
sum = dupWalker.reduce(x, sum);
}
return sum;
}
// --------------------------------------------------------------------------------------------------------------
//
// new style interface to the system
//
// --------------------------------------------------------------------------------------------------------------
/**
* Traverse by reads, given the data and the walker
*
@ -276,8 +191,7 @@ public class TraverseDuplicates extends TraversalEngine {
Shard shard,
ShardDataProvider dataProvider,
T sum) {
logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
//logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", shard.getGenomeLoc()));
if (!(walker instanceof DuplicateWalker))
throw new IllegalArgumentException("Walker isn't a duplicate walker!");
@ -292,13 +206,46 @@ public class TraverseDuplicates extends TraversalEngine {
FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc());
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
return actuallyTraverse(dupWalker, iter, sum);
}
/**
* while we still have more reads:
* ok, here's the idea. We get all the reads that start at the same position in the genome
* We then split the list of reads into sublists of reads:
* -> those with the same mate pair position, for paired reads
* -> those flagged as unpaired and duplicated but having the same start and end
*/
for (SAMRecord read : iter) {
// get the genome loc from the read
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
Set<List<SAMRecord>> readSets = uniqueReadSets(readsAtLoc(read, iter));
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size()));
// Jump forward in the reference to this locus location
AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileup(site));
// update the number of duplicate sets we've seen
TraversalStatistics.nRecords++;
final boolean keepMeP = dupWalker.filter(site, locus, readSets);
if (keepMeP) {
M x = dupWalker.map(site, locus, readSets);
sum = dupWalker.reduce(x, sum);
}
printProgress(DUPS_STRING, site);
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
break;
}
}
return sum;
}
/**
* Temporary override of printOnTraversalDone.
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
*
* @param sum Result of the computation.
* @param <T> Type of the result.

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List;
import java.util.Set;
import net.sf.samtools.SAMRecord;
@ -17,9 +18,7 @@ import net.sf.samtools.SAMRecord;
@Requires({DataSource.READS,DataSource.REFERENCE})
public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(GenomeLoc loc, byte[] refBases, AlignmentContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
/**
 * Decides whether map() should be invoked for this locus.
 *
 * @param loc      the genomic location of the duplicate sets
 * @param context  the alignment context at this locus
 * @param readSets the sets of grouped reads at this locus (one list per molecule)
 * @return true to process the locus; this default implementation keeps everything
 */
public boolean filter(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
return true; // We are keeping all the reads
}
@ -31,9 +30,14 @@ public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapTyp
*/
public boolean mapUniqueReadsTooP() { return false; }
public abstract MapType map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads);
/**
* Called by the traversal engine to decide whether to call map() at loci without duplicate reads
*
* @return true if you want to see non duplicates during the traversal
*/
public boolean mapAtLociWithoutDuplicates() { return true; }
public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets );
// Given result of map function
public abstract ReduceType reduceInit();

View File

@ -23,7 +23,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
package org.broadinstitute.sting.oneoffprojects.walkers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
class MismatchCounter {
long nObs = 0;
@ -160,41 +161,40 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
}
// Print out data for regression
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
public List<DuplicateComp> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
if ( ! ACTUALLY_DO_WORK )
return pairwiseComps;
if ( COMBINE_QUALS ) {
Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
if ( combinedReads != null ) {
SAMRecord combined1 = combinedReads.first;
SAMRecord combined2 = combinedReads.second;
if ( comparePairToSingleton )
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
else
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
}
} else {
int nComparisons = 0;
for ( SAMRecord read1 : duplicateReads ) {
for ( SAMRecord read2 : duplicateReads ) {
if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
// the hashcode insures we don't do A vs. B and B vs. A
//System.out.printf("Comparing %s against %s%n", read1, read2);
nComparisons++;
pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
break;
}
}
}
}
// todo -- fixme -- the logic here is all wrong given new interface
// if ( ! ACTUALLY_DO_WORK )
// return pairwiseComps;
//
// if ( COMBINE_QUALS ) {
// Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
// if ( combinedReads != null ) {
// SAMRecord combined1 = combinedReads.first;
// SAMRecord combined2 = combinedReads.second;
//
// if ( comparePairToSingleton )
// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
// else
// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
// }
// } else {
// int nComparisons = 0;
// for ( SAMRecord read1 : duplicateReads ) {
// for ( SAMRecord read2 : duplicateReads ) {
// if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
// // the hashcode insures we don't do A vs. B and B vs. A
// //System.out.printf("Comparing %s against %s%n", read1, read2);
// nComparisons++;
// pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
// if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
// break;
// }
// }
// }
// }
return pairwiseComps;
}

View File

@ -7,6 +7,8 @@ import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List;
import java.util.Set;
import java.util.ArrayList;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
@ -15,7 +17,7 @@ import net.sf.samtools.SAMFileWriter;
* Process the input bam file, optionally emitting all the unique reads found, and emitting the combined duplicate reads to
* the specified output BAM location. If no output location is specified, the reads are written to STDOUT.
*/
public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileWriter> {
public class CombineDuplicatesWalker extends DuplicateWalker<List<SAMRecord>, SAMFileWriter> {
@Argument(fullName="outputBAM", shortName="outputBAM", required=false, doc="BAM File to write combined duplicates to")
public SAMFileWriter outputBAM = null;
@ -45,50 +47,58 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
/**
* emit the reads that were produced by combining the duplicates
*/
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
if ( output != null ) {
output.addAlignment(read);
} else {
out.println(read.format());
/**
 * Emits each combined read produced by map(), either to the output BAM writer
 * (when one was provided) or to standard out as formatted SAM text.
 *
 * @param reads  the combined reads produced by map() for one locus
 * @param output the BAM writer to append to, or null to print to stdout
 * @return the same output writer, so it threads through the traversal
 */
public SAMFileWriter reduce(List<SAMRecord> reads, SAMFileWriter output) {
for ( SAMRecord read : reads ) {
if ( output != null ) {
// write the alignment to the BAM file
output.addAlignment(read);
} else {
// no writer configured -- fall back to text output
out.println(read.format());
}
}
return output;
}
/**
 * We don't want map() called at loci without duplicates, since this walker
 * only has work to do where duplicate sets exist.
 *
 * @return false, so loci containing no duplicate reads are skipped
 */
public boolean mapAtLociWithoutDuplicates() { return false; }
/**
* Build a combined read given the input list of non-unique reads. If there's just one read in the
* set, it's considered unique and returned. If there's more than one, the N-way combine
* duplicate function is invoked.
*
* @param loc the genome loc
* @param refBases the reference bases for the given locus
* @param context the alignment context that has the reads information
* @param duplicateReads a list of the dupplicate reads at this locus
* @param uniqueReads the unique read list at this locus
* @param readSets the sets of grouped duplicate reads at this locus
* @return a list of reads, one combined read per duplicate set at this locus
*/
public SAMRecord map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
public List<SAMRecord> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
List<SAMRecord> combinedReads = new ArrayList<SAMRecord>();
SAMRecord combinedRead = null;
for ( List<SAMRecord> reads : readSets ) {
SAMRecord combinedRead = null;
if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) {
// we are a unique read
combinedRead = duplicateReads.get(0);
} else {
// actually call the combine function
//for (SAMRecord read : duplicateReads ) {
// System.out.printf("Read %s%n", read.format());
//}
combinedRead = DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE);
if ( reads.size() == 1 && ! reads.get(0).getDuplicateReadFlag() ) {
// we are a unique read
combinedRead = reads.get(0);
} else {
// actually call the combine function
//for (SAMRecord read : duplicateReads ) {
// System.out.printf("Read %s%n", read.format());
//}
combinedRead = DupUtils.combineDuplicates(reads, MAX_QUALITY_SCORE);
}
if ( combinedRead.getDuplicateReadFlag() )
throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
combinedRead.getReadName(), reads.size(), combinedRead.format()));
combinedReads.add(combinedRead);
}
if ( combinedRead.getDuplicateReadFlag() )
throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
combinedRead.getReadName(), duplicateReads.size(), combinedRead.format()));
return combinedRead;
return combinedReads;
}
}

View File

@ -29,41 +29,61 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List;
import java.util.Set;
import java.util.ArrayList;
/**
* a class to store the traversal information we pass around
*/
class DuplicateCount {
public int count = 0; // the count of sites we were given
public int undupDepth = 0; // the unique read count
public int depth = 0; // the dupplicate read depth
}
public int count = 0; // the count of sites we were given
public int nUniqueMolecules = 0; // the count of unique molecules (read sets) seen
public int nDuplicatedMolecules = 0; // the count of molecules represented by more than one read
public int depth = 0; // the total read depth summed over all read sets
}
/**
* Count the number of unique reads, duplicates, and the average depth of unique reads and duplicates at all positions.
* @author aaron
*/
public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, DuplicateCount> {
// Suppresses the per-locus output lines emitted from map() when set.
// Fix: user-visible help text had a typo ("printex" -> "printed").
@Argument(fullName="quiet", required=false, doc="If true, per locus information isn't printed")
public boolean quiet = false;
/**
* the map function, conforming to the duplicates interface
* @param loc the genomic location
* @param refBases the reference bases that cover this position, which we turn off
* @param context the AlignmentContext, containing all the reads overlapping this region
* @param uniqueReads all the unique reads, bundled together
* @param duplicateReads all the duplicate reads
* @param readSets all the duplicate reads
* @return a DuplicateCount object, with the appropriate stats
*/
public DuplicateCount map(GenomeLoc loc, byte[] refBases, AlignmentContext context, List<SAMRecord> uniqueReads, List<SAMRecord> duplicateReads) {
public DuplicateCount map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
if ( ! quiet ) out.printf("%s with %d read sets => ", loc, readSets.size());
DuplicateCount dup = new DuplicateCount();
dup.depth = 0;
for ( List<SAMRecord> reads : readSets) {
List<String> names = new ArrayList<String>();
for ( SAMRecord read : reads ) {
names.add(read.getReadName());
}
if ( ! quiet ) out.printf("%d reads [%s] ", reads.size(), Utils.join(",", names));
dup.depth += reads.size();
dup.nDuplicatedMolecules += reads.size() > 1 ? 1 : 0; // a set with more than one read means this molecule was duplicated
}
if ( ! quiet ) out.printf("%n");
dup.count = 1;
dup.undupDepth = uniqueReads.size();
dup.depth = duplicateReads.size();
dup.nUniqueMolecules = readSets.size();
return dup;
}
public boolean mapAtLociWithoutDuplicates() { return true; }
/**
* setup our walker. In this case, new a DuplicateCount object and return it
* @return the object holding the counts of the duplicates
@ -82,7 +102,8 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
DuplicateCount dup = new DuplicateCount();
dup.count = sum.count + value.count;
dup.depth = value.depth + sum.depth;
dup.undupDepth = value.undupDepth + sum.undupDepth;
dup.nDuplicatedMolecules = value.nDuplicatedMolecules + sum.nDuplicatedMolecules;
dup.nUniqueMolecules = value.nUniqueMolecules + sum.nUniqueMolecules;
return dup;
}
@ -93,9 +114,10 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
public void onTraversalDone(DuplicateCount result) {
out.println("[REDUCE RESULT] Traversal result is: ");
out.println("traversal iterations = " + result.count);
out.println("average depth = " + (double)result.depth / (double)result.count);
out.println("duplicates seen = " + result.depth);
out.println("unique read count = " + result.undupDepth);
out.println("average unique read depth = " + (double)result.undupDepth / (double)result.count);
out.printf("average depth = %.2f%n", (double)result.depth / (double)result.count);
out.println("unique molecules seen = " + result.nUniqueMolecules);
out.println("duplicated molecules seen = " + result.nDuplicatedMolecules);
out.printf("percent duplicated = %.2f%%%n", result.nDuplicatedMolecules / (double)result.nUniqueMolecules * 100);
out.printf("average unique read depth = %.2f%n", (double)result.nUniqueMolecules / (double)result.count);
}
}

View File

@ -39,6 +39,13 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
this(loc, readsOffsets2Pileup(reads, offsets));
}
/**
 * Create an empty read-backed pileup at loc, containing no aligned reads.
 *
 * @param loc the genomic location for this (empty) pileup
 */
public ReadBackedPileup(GenomeLoc loc ) {
this(loc, new ArrayList<PileupElement>(0));
}
/**
* Create a new version of a read backed pileup at loc, using the reads and their corresponding

View File

@ -12,6 +12,7 @@ import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/**
@ -37,49 +38,48 @@ public class TraverseDuplicatesTest extends BaseTest {
public void testAllDupplicatesNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
read.setDuplicateReadFlag(true);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(0, myPairing.first.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
Set<List<SAMRecord>> myPairings = obj.uniqueReadSets(list);
Assert.assertEquals(1, myPairings.size());
Assert.assertEquals(10, myPairings.iterator().next().size()); // dup's
}
@Test
public void testNoDupplicatesNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
read.setDuplicateReadFlag(false);
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(10, myPairing.first.size()); // unique
Assert.assertEquals(0, myPairing.second.size()); // dup's
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(10, myPairing.size()); // unique
}
@Test
public void testFiftyFiftyNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 5; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
read.setDuplicateReadFlag(true);
list.add(read);
}
for (int x = 10; x < 15; x++)
list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(5, myPairing.first.size()); // unique
Assert.assertEquals(5, myPairing.second.size()); // dup's
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(6, myPairing.size()); // unique
}
@Test
public void testAllDupplicatesAllPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
read.setDuplicateReadFlag(true);
read.setMateAlignmentStart(100);
read.setMateReferenceIndex(0);
@ -87,16 +87,15 @@ public class TraverseDuplicatesTest extends BaseTest {
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(0, myPairing.first.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(1, myPairing.size()); // unique
}
@Test
public void testNoDupplicatesAllPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
read.setMateReferenceIndex(0);
@ -104,16 +103,15 @@ public class TraverseDuplicatesTest extends BaseTest {
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(0, myPairing.first.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(1, myPairing.size()); // unique
}
@Test
public void testAllDupplicatesAllPairsDifferentPairedEnd() {
List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
read.setMateAlignmentStart(100 + x);
read.setMateReferenceIndex(0);
@ -121,8 +119,7 @@ public class TraverseDuplicatesTest extends BaseTest {
list.add(read);
}
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(9, myPairing.first.size()); // unique
Assert.assertEquals(1, myPairing.second.size()); // dup's
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(10, myPairing.size()); // unique
}
}

View File

@ -249,7 +249,7 @@ def calculateBinsForValues(values, field, minValue, maxValue, partitions):
bins[-1][1] = bin[0]
bins[-1][2] += curSize
#print 'Returning ', bins
print 'Returning ', bins
#sys.exit(1)
return bins