Completely rewritten duplicate traversal, more free of bugs, with integration tests for count duplicates walker validated on a TCGA hybrid capture lane.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2458 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-12-28 23:56:49 +00:00
parent 4617052b3c
commit fcc80e8632
8 changed files with 227 additions and 240 deletions

View File

@ -32,6 +32,7 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.providers.ReadView; import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ManagingReferenceOrderedView;
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard; import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.PushbackIterator; import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
@ -40,11 +41,10 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/** /**
* @author Mark DePristo * @author Mark DePristo
@ -85,128 +85,58 @@ public class TraverseDuplicates extends TraversalEngine {
return l; return l;
} }
protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) { protected Set<List<SAMRecord>> uniqueReadSets(List<SAMRecord> reads) {
List<SAMRecord> uniques = new ArrayList<SAMRecord>(); Set<List<SAMRecord>> readSets = new HashSet<List<SAMRecord>>();
List<SAMRecord> dups = new ArrayList<SAMRecord>(); for ( SAMRecord read : reads ) {
// find the first duplicate List<SAMRecord> readSet = findDuplicateReads(read, readSets);
SAMRecord key = null; if ( readSet == null ) {
for (SAMRecord read : reads) { readSets.add(new ArrayList<SAMRecord>(Arrays.asList(read))); // copy so I can add to the list
if (read.getDuplicateReadFlag()) { } else {
// this is our key readSet.add(read);
key = read;
if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName()));
break;
} }
} }
// At this point, there are two possibilities, we have found at least one dup or not return readSets;
// if it's a dup, add it to the dups list, otherwise add it to the uniques list }
if (key != null) {
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key); protected List<SAMRecord> findDuplicateReads(SAMRecord read, Set<List<SAMRecord>> readSets ) {
final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null : if ( read.getReadPairedFlag() ) {
GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); // paired
for (SAMRecord read : reads) { final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null : for (List<SAMRecord> reads : readSets) {
GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); SAMRecord key = reads.get(0);
if (DEBUG) //if (DEBUG)
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc)); // logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
// read and key start at the same place, and either the this read and the key // read and key start at the same place, and either the this read and the key
// share a mate location or the read is flagged as a duplicate // share a mate location or the read is flagged as a duplicate
if (readLoc.compareTo(keyLoc) == 0 || if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) {
read.getDuplicateReadFlag()) { // at least one has to be marked as a duplicate
if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) || final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
(readMateLoc == null && keyMateLoc == null)) { if ( readMateLoc.compareTo(keyMateLoc) == 0 ) {
// we are at the same position as the dup and have the same mat pos, it's a dup // we are at the same position as the dup and have the same mat pos, it's a dup
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read)); if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc));
dups.add(read); return reads;
} else {
uniques.add(read);
} }
} else {
uniques.add(read);
} }
} }
} else { } else {
uniques = reads; for (List<SAMRecord> reads : readSets) {
} SAMRecord key = reads.get(0);
boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength();
return new Pair<List<SAMRecord>, List<SAMRecord>>(uniques, dups); //System.out.printf("%s %s %b %b %d %d %d %d => %b%n",
} // read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(),
// read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v);
/** if ( v ) {
* Traverse by reads, given the data and the walker //System.out.printf("Returning reads...%n");
* return reads;
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @param dupWalker our duplicates walker
* @param readIter our iterator
*
* @return the reduce type, T, the final product of all the reduce calls
*/
private <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
Iterator<SAMRecord> readIter,
T sum) {
/**
* while we still have more reads:
* ok, here's the idea. We get all the reads that start at the same position in the genome
* We then split the list of reads into sublists of reads:
* -> those with the same mate pair position, for paired reads
* -> those flagged as unpaired and duplicated but having the same start and end and
*/
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
for (SAMRecord read : iter) {
// get the genome loc from the read
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
List<SAMRecord> reads = readsAtLoc(read, iter);
Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
List<SAMRecord> uniqueReads = split.getFirst();
List<SAMRecord> duplicateReads = split.getSecond();
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
if (reads.size() != uniqueReads.size() + duplicateReads.size())
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
// Jump forward in the reference to this locus location
AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));
// update the number of duplicate sets we've seen
TraversalStatistics.nRecords++;
// we still have to fix the locus context provider to take care of this problem with > 1 length contexts
// AlignmentContext locus = locusProvider.getLocusContext(site);
byte[] refBases = new byte[0];
if (dupWalker.mapUniqueReadsTooP()) {
// Send each unique read to the map function
for (SAMRecord unique : uniqueReads) {
List<SAMRecord> l = Arrays.asList(unique);
sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
} }
} }
if (duplicateReads.size() > 0)
sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
printProgress(DUPS_STRING, site);
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
break;
}
} }
return sum; return null;
} }
/** /**
@ -232,7 +162,7 @@ public class TraverseDuplicates extends TraversalEngine {
if (result) { if (result) {
TraversalStatistics.nSkippedReads++; TraversalStatistics.nSkippedReads++;
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why); //System.out.printf(" [filter] %s => %b", rec.getReadName(), result);
} else { } else {
TraversalStatistics.nReads++; TraversalStatistics.nReads++;
} }
@ -240,27 +170,12 @@ public class TraverseDuplicates extends TraversalEngine {
} }
} }
public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads,
GenomeLoc site,
byte[] refBases,
AlignmentContext locus,
T sum) {
final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads);
if (keepMeP) {
M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads);
sum = dupWalker.reduce(x, sum);
}
return sum;
}
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
// new style interface to the system // new style interface to the system
// //
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
/** /**
* Traverse by reads, given the data and the walker * Traverse by reads, given the data and the walker
* *
@ -276,8 +191,7 @@ public class TraverseDuplicates extends TraversalEngine {
Shard shard, Shard shard,
ShardDataProvider dataProvider, ShardDataProvider dataProvider,
T sum) { T sum) {
//logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", shard.getGenomeLoc()));
logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
if (!(walker instanceof DuplicateWalker)) if (!(walker instanceof DuplicateWalker))
throw new IllegalArgumentException("Walker isn't a duplicate walker!"); throw new IllegalArgumentException("Walker isn't a duplicate walker!");
@ -292,13 +206,46 @@ public class TraverseDuplicates extends TraversalEngine {
FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc()); FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc());
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter); PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
return actuallyTraverse(dupWalker, iter, sum);
}
/**
* while we still have more reads:
* ok, here's the idea. We get all the reads that start at the same position in the genome
* We then split the list of reads into sublists of reads:
* -> those with the same mate pair position, for paired reads
* -> those flagged as unpaired and duplicated but having the same start and end
*/
for (SAMRecord read : iter) {
// get the genome loc from the read
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
Set<List<SAMRecord>> readSets = uniqueReadSets(readsAtLoc(read, iter));
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size()));
// Jump forward in the reference to this locus location
AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileup(site));
// update the number of duplicate sets we've seen
TraversalStatistics.nRecords++;
final boolean keepMeP = dupWalker.filter(site, locus, readSets);
if (keepMeP) {
M x = dupWalker.map(site, locus, readSets);
sum = dupWalker.reduce(x, sum);
}
printProgress(DUPS_STRING, site);
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
break;
}
}
return sum;
}
/** /**
* Temporary override of printOnTraversalDone. * Temporary override of printOnTraversalDone.
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
* *
* @param sum Result of the computation. * @param sum Result of the computation.
* @param <T> Type of the result. * @param <T> Type of the result.

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List; import java.util.List;
import java.util.Set;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
@ -17,9 +18,7 @@ import net.sf.samtools.SAMRecord;
@Requires({DataSource.READS,DataSource.REFERENCE}) @Requires({DataSource.READS,DataSource.REFERENCE})
public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> { public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context? // Do we actually want to operate on the context?
public boolean filter(GenomeLoc loc, byte[] refBases, AlignmentContext context, public boolean filter(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
return true; // We are keeping all the reads return true; // We are keeping all the reads
} }
@ -31,9 +30,14 @@ public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapTyp
*/ */
public boolean mapUniqueReadsTooP() { return false; } public boolean mapUniqueReadsTooP() { return false; }
public abstract MapType map(GenomeLoc loc, byte[] refBases, AlignmentContext context, /**
List<SAMRecord> uniqueReads, * Called by the traversal engine to decide whether to call map() at loci without duplicate reads
List<SAMRecord> duplicateReads); *
* @return true if you want to see non duplicates during the traversal
*/
public boolean mapAtLociWithoutDuplicates() { return true; }
public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets );
// Given result of map function // Given result of map function
public abstract ReduceType reduceInit(); public abstract ReduceType reduceInit();

View File

@ -23,7 +23,7 @@
* OTHER DEALINGS IN THE SOFTWARE. * OTHER DEALINGS IN THE SOFTWARE.
*/ */
package org.broadinstitute.sting.playground.gatk.walkers.duplicates; package org.broadinstitute.sting.oneoffprojects.walkers;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Set;
class MismatchCounter { class MismatchCounter {
long nObs = 0; long nObs = 0;
@ -160,41 +161,40 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
} }
// Print out data for regression // Print out data for regression
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, AlignmentContext context, public List<DuplicateComp> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
List<SAMRecord> uniqueReads,
List<SAMRecord> duplicateReads) {
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size())); //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>(); List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
if ( ! ACTUALLY_DO_WORK )
return pairwiseComps;
if ( COMBINE_QUALS ) { // todo -- fixme -- the logic here is all wrong given new interface
Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads ); // if ( ! ACTUALLY_DO_WORK )
if ( combinedReads != null ) { // return pairwiseComps;
SAMRecord combined1 = combinedReads.first; //
SAMRecord combined2 = combinedReads.second; // if ( COMBINE_QUALS ) {
// Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
if ( comparePairToSingleton ) // if ( combinedReads != null ) {
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads ); // SAMRecord combined1 = combinedReads.first;
else // SAMRecord combined2 = combinedReads.second;
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads ); //
} // if ( comparePairToSingleton )
} else { // pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
int nComparisons = 0; // else
for ( SAMRecord read1 : duplicateReads ) { // pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
for ( SAMRecord read2 : duplicateReads ) { // }
if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) { // } else {
// the hashcode insures we don't do A vs. B and B vs. A // int nComparisons = 0;
//System.out.printf("Comparing %s against %s%n", read1, read2); // for ( SAMRecord read1 : duplicateReads ) {
nComparisons++; // for ( SAMRecord read2 : duplicateReads ) {
pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads ); // if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET ) // // the hashcode insures we don't do A vs. B and B vs. A
break; // //System.out.printf("Comparing %s against %s%n", read1, read2);
} // nComparisons++;
} // pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
} // if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
} // break;
// }
// }
// }
// }
return pairwiseComps; return pairwiseComps;
} }

View File

@ -7,6 +7,8 @@ import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.ArrayList;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMFileWriter;
@ -15,7 +17,7 @@ import net.sf.samtools.SAMFileWriter;
* Process the input bam file, optionally emitting all the unique reads found, and emitting the combined duplicate reads to * Process the input bam file, optionally emitting all the unique reads found, and emitting the combined duplicate reads to
* the specified output BAM location. If no output location is specified, the reads are written to STDOUT. * the specified output BAM location. If no output location is specified, the reads are written to STDOUT.
*/ */
public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileWriter> { public class CombineDuplicatesWalker extends DuplicateWalker<List<SAMRecord>, SAMFileWriter> {
@Argument(fullName="outputBAM", shortName="outputBAM", required=false, doc="BAM File to write combined duplicates to") @Argument(fullName="outputBAM", shortName="outputBAM", required=false, doc="BAM File to write combined duplicates to")
public SAMFileWriter outputBAM = null; public SAMFileWriter outputBAM = null;
@ -45,50 +47,58 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
/** /**
* emit the read that was produced by combining the dupplicates * emit the read that was produced by combining the dupplicates
*/ */
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) { public SAMFileWriter reduce(List<SAMRecord> reads, SAMFileWriter output) {
if ( output != null ) { for ( SAMRecord read : reads ) {
output.addAlignment(read); if ( output != null ) {
} else { output.addAlignment(read);
out.println(read.format()); } else {
out.println(read.format());
}
} }
return output; return output;
} }
/**
* We don't want to see loci without duplicates, since
* @return
*/
public boolean mapAtLociWithoutDuplicates() { return false; }
/** /**
* Build a combined read given the input list of non-unique reads. If there's just one read in the * Build a combined read given the input list of non-unique reads. If there's just one read in the
* set, it's considered unique and returned. If there's more than one, the N-way combine * set, it's considered unique and returned. If there's more than one, the N-way combine
* duplicate function is invoked. * duplicate function is invoked.
* *
* @param loc the genome loc * @param loc the genome loc
* @param refBases the reference bases for the given locus
* @param context the alignment context that has the reads information * @param context the alignment context that has the reads information
* @param duplicateReads a list of the dupplicate reads at this locus * @param readSets the set of unique reads list at this locus
* @param uniqueReads the unique read list at this locus
* @return a read that combines the dupplicate reads at this locus * @return a read that combines the dupplicate reads at this locus
*/ */
public SAMRecord map(GenomeLoc loc, byte[] refBases, AlignmentContext context, public List<SAMRecord> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
List<SAMRecord> uniqueReads, List<SAMRecord> combinedReads = new ArrayList<SAMRecord>();
List<SAMRecord> duplicateReads) {
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
SAMRecord combinedRead = null; for ( List<SAMRecord> reads : readSets ) {
SAMRecord combinedRead = null;
if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) { if ( reads.size() == 1 && ! reads.get(0).getDuplicateReadFlag() ) {
// we are a unique read // we are a unique read
combinedRead = duplicateReads.get(0); combinedRead = reads.get(0);
} else { } else {
// actually call the combine function // actually call the combine function
//for (SAMRecord read : duplicateReads ) { //for (SAMRecord read : duplicateReads ) {
// System.out.printf("Read %s%n", read.format()); // System.out.printf("Read %s%n", read.format());
//} //}
combinedRead = DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE); combinedRead = DupUtils.combineDuplicates(reads, MAX_QUALITY_SCORE);
}
if ( combinedRead.getDuplicateReadFlag() )
throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
combinedRead.getReadName(), reads.size(), combinedRead.format()));
combinedReads.add(combinedRead);
} }
if ( combinedRead.getDuplicateReadFlag() ) return combinedReads;
throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
combinedRead.getReadName(), duplicateReads.size(), combinedRead.format()));
return combinedRead;
} }
} }

View File

@ -29,41 +29,61 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.ArrayList;
/** /**
* a class to store the traversal information we pass around * a class to store the traversal information we pass around
*/ */
class DuplicateCount { class DuplicateCount {
public int count = 0; // the count of sites we were given public int count = 0; // the count of sites we were given
public int undupDepth = 0; // the unique read count public int nUniqueMolecules = 0; // the unique read count
public int depth = 0; // the dupplicate read depth public int nDuplicatedMolecules = 0; // the unique read count
} public int depth = 0; // the dupplicate read depth
}
/** /**
* Count the number of unique reads, duplicates, and the average depth of unique reads and duplicates at all positions. * Count the number of unique reads, duplicates, and the average depth of unique reads and duplicates at all positions.
* @author aaron * @author aaron
*/ */
public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, DuplicateCount> { public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, DuplicateCount> {
@Argument(fullName="quiet", required=false, doc="If true, per locus information isn't printex")
public boolean quiet = false;
/** /**
* the map function, conforming to the duplicates interface * the map function, conforming to the duplicates interface
* @param loc the genomic location * @param loc the genomic location
* @param refBases the reference bases that cover this position, which we turn off
* @param context the AlignmentContext, containing all the reads overlapping this region * @param context the AlignmentContext, containing all the reads overlapping this region
* @param uniqueReads all the unique reads, bundled together * @param readSets all the duplicate reads
* @param duplicateReads all the duplicate reads
* @return a DuplicateCount object, with the appropriate stats * @return a DuplicateCount object, with the appropriate stats
*/ */
public DuplicateCount map(GenomeLoc loc, byte[] refBases, AlignmentContext context, List<SAMRecord> uniqueReads, List<SAMRecord> duplicateReads) { public DuplicateCount map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
if ( ! quiet ) out.printf("%s with %d read sets => ", loc, readSets.size());
DuplicateCount dup = new DuplicateCount(); DuplicateCount dup = new DuplicateCount();
dup.depth = 0;
for ( List<SAMRecord> reads : readSets) {
List<String> names = new ArrayList<String>();
for ( SAMRecord read : reads ) {
names.add(read.getReadName());
}
if ( ! quiet ) out.printf("%d reads [%s] ", reads.size(), Utils.join(",", names));
dup.depth += reads.size();
dup.nDuplicatedMolecules += reads.size() > 1 ? 1 : 0; // if there's more than 1 read per set, we're a duplicated reads
}
if ( ! quiet ) out.printf("%n");
dup.count = 1; dup.count = 1;
dup.undupDepth = uniqueReads.size(); dup.nUniqueMolecules = readSets.size();
dup.depth = duplicateReads.size();
return dup; return dup;
} }
public boolean mapAtLociWithoutDuplicates() { return true; }
/** /**
* setup our walker. In this case, new a DuplicateCount object and return it * setup our walker. In this case, new a DuplicateCount object and return it
* @return the object holding the counts of the duplicates * @return the object holding the counts of the duplicates
@ -82,7 +102,8 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
DuplicateCount dup = new DuplicateCount(); DuplicateCount dup = new DuplicateCount();
dup.count = sum.count + value.count; dup.count = sum.count + value.count;
dup.depth = value.depth + sum.depth; dup.depth = value.depth + sum.depth;
dup.undupDepth = value.undupDepth + sum.undupDepth; dup.nDuplicatedMolecules = value.nDuplicatedMolecules + sum.nDuplicatedMolecules;
dup.nUniqueMolecules = value.nUniqueMolecules + sum.nUniqueMolecules;
return dup; return dup;
} }
@ -93,9 +114,10 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
public void onTraversalDone(DuplicateCount result) { public void onTraversalDone(DuplicateCount result) {
out.println("[REDUCE RESULT] Traversal result is: "); out.println("[REDUCE RESULT] Traversal result is: ");
out.println("traversal iterations = " + result.count); out.println("traversal iterations = " + result.count);
out.println("average depth = " + (double)result.depth / (double)result.count); out.printf("average depth = %.2f%n", (double)result.depth / (double)result.count);
out.println("duplicates seen = " + result.depth); out.println("unique molecules seen = " + result.nUniqueMolecules);
out.println("unique read count = " + result.undupDepth); out.println("duplicated molecules seen = " + result.nDuplicatedMolecules);
out.println("average unique read depth = " + (double)result.undupDepth / (double)result.count); out.printf("percent duplicated = %.2f%%%n", result.nDuplicatedMolecules / (double)result.nUniqueMolecules * 100);
out.printf("average unique read depth = %.2f%n", (double)result.nUniqueMolecules / (double)result.count);
} }
} }

View File

@ -39,6 +39,13 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
this(loc, readsOffsets2Pileup(reads, offsets)); this(loc, readsOffsets2Pileup(reads, offsets));
} }
/**
* Create a new version of a read backed pileup at loc without any aligned reads
*
*/
public ReadBackedPileup(GenomeLoc loc ) {
this(loc, new ArrayList<PileupElement>(0));
}
/** /**
* Create a new version of a read backed pileup at loc, using the reads and their corresponding * Create a new version of a read backed pileup at loc, using the reads and their corresponding

View File

@ -12,6 +12,7 @@ import org.junit.Test;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Set;
/** /**
@ -37,49 +38,48 @@ public class TraverseDuplicatesTest extends BaseTest {
public void testAllDupplicatesNoPairs() { public void testAllDupplicatesNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>(); List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) { for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
read.setDuplicateReadFlag(true); read.setDuplicateReadFlag(true);
list.add(read); list.add(read);
} }
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list); Set<List<SAMRecord>> myPairings = obj.uniqueReadSets(list);
Assert.assertEquals(0, myPairing.first.size()); // unique Assert.assertEquals(1, myPairings.size());
Assert.assertEquals(10, myPairing.second.size()); // dup's Assert.assertEquals(10, myPairings.iterator().next().size()); // dup's
} }
@Test @Test
public void testNoDupplicatesNoPairs() { public void testNoDupplicatesNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>(); List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) { for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
read.setDuplicateReadFlag(false); read.setDuplicateReadFlag(false);
list.add(read); list.add(read);
} }
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
Assert.assertEquals(10, myPairing.first.size()); // unique Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(0, myPairing.second.size()); // dup's Assert.assertEquals(10, myPairing.size()); // unique
} }
@Test @Test
public void testFiftyFiftyNoPairs() { public void testFiftyFiftyNoPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>(); List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 5; x++) { for (int x = 0; x < 5; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
read.setDuplicateReadFlag(true); read.setDuplicateReadFlag(true);
list.add(read); list.add(read);
} }
for (int x = 10; x < 15; x++) for (int x = 10; x < 15; x++)
list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100)); list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list); Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(5, myPairing.first.size()); // unique Assert.assertEquals(6, myPairing.size()); // unique
Assert.assertEquals(5, myPairing.second.size()); // dup's
} }
@Test @Test
public void testAllDupplicatesAllPairs() { public void testAllDupplicatesAllPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>(); List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) { for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
read.setDuplicateReadFlag(true); read.setDuplicateReadFlag(true);
read.setMateAlignmentStart(100); read.setMateAlignmentStart(100);
read.setMateReferenceIndex(0); read.setMateReferenceIndex(0);
@ -87,16 +87,15 @@ public class TraverseDuplicatesTest extends BaseTest {
list.add(read); list.add(read);
} }
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list); Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(0, myPairing.first.size()); // unique Assert.assertEquals(1, myPairing.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
} }
@Test @Test
public void testNoDupplicatesAllPairs() { public void testNoDupplicatesAllPairs() {
List<SAMRecord> list = new ArrayList<SAMRecord>(); List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) { for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line) if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
read.setMateReferenceIndex(0); read.setMateReferenceIndex(0);
@ -104,16 +103,15 @@ public class TraverseDuplicatesTest extends BaseTest {
list.add(read); list.add(read);
} }
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list); Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(0, myPairing.first.size()); // unique Assert.assertEquals(1, myPairing.size()); // unique
Assert.assertEquals(10, myPairing.second.size()); // dup's
} }
@Test @Test
public void testAllDupplicatesAllPairsDifferentPairedEnd() { public void testAllDupplicatesAllPairsDifferentPairedEnd() {
List<SAMRecord> list = new ArrayList<SAMRecord>(); List<SAMRecord> list = new ArrayList<SAMRecord>();
for (int x = 0; x < 10; x++) { for (int x = 0; x < 10; x++) {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
read.setMateAlignmentStart(100 + x); read.setMateAlignmentStart(100 + x);
read.setMateReferenceIndex(0); read.setMateReferenceIndex(0);
@ -121,8 +119,7 @@ public class TraverseDuplicatesTest extends BaseTest {
list.add(read); list.add(read);
} }
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list); Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
Assert.assertEquals(9, myPairing.first.size()); // unique Assert.assertEquals(10, myPairing.size()); // unique
Assert.assertEquals(1, myPairing.second.size()); // dup's
} }
} }

View File

@ -249,7 +249,7 @@ def calculateBinsForValues(values, field, minValue, maxValue, partitions):
bins[-1][1] = bin[0] bins[-1][1] = bin[0]
bins[-1][2] += curSize bins[-1][2] += curSize
#print 'Returning ', bins print 'Returning ', bins
#sys.exit(1) #sys.exit(1)
return bins return bins