Completely rewritten duplicate traversal, more free of bugs, with integration tests for count duplicates walker validated on a TCGA hybrid capture lane.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2458 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4617052b3c
commit
fcc80e8632
|
|
@ -32,6 +32,7 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ManagingReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||
|
|
@ -40,11 +41,10 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @author Mark DePristo
|
||||
|
|
@ -85,128 +85,58 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
return l;
|
||||
}
|
||||
|
||||
protected Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
|
||||
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
|
||||
List<SAMRecord> dups = new ArrayList<SAMRecord>();
|
||||
|
||||
// find the first duplicate
|
||||
SAMRecord key = null;
|
||||
for (SAMRecord read : reads) {
|
||||
if (read.getDuplicateReadFlag()) {
|
||||
// this is our key
|
||||
key = read;
|
||||
if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName()));
|
||||
break;
|
||||
protected Set<List<SAMRecord>> uniqueReadSets(List<SAMRecord> reads) {
|
||||
Set<List<SAMRecord>> readSets = new HashSet<List<SAMRecord>>();
|
||||
for ( SAMRecord read : reads ) {
|
||||
|
||||
List<SAMRecord> readSet = findDuplicateReads(read, readSets);
|
||||
if ( readSet == null ) {
|
||||
readSets.add(new ArrayList<SAMRecord>(Arrays.asList(read))); // copy so I can add to the list
|
||||
} else {
|
||||
readSet.add(read);
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, there are two possibilities, we have found at least one dup or not
|
||||
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
|
||||
if (key != null) {
|
||||
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
|
||||
final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null :
|
||||
GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
|
||||
for (SAMRecord read : reads) {
|
||||
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
|
||||
final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null :
|
||||
GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
|
||||
if (DEBUG)
|
||||
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
|
||||
return readSets;
|
||||
}
|
||||
|
||||
protected List<SAMRecord> findDuplicateReads(SAMRecord read, Set<List<SAMRecord>> readSets ) {
|
||||
if ( read.getReadPairedFlag() ) {
|
||||
// paired
|
||||
final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
|
||||
|
||||
for (List<SAMRecord> reads : readSets) {
|
||||
SAMRecord key = reads.get(0);
|
||||
//if (DEBUG)
|
||||
// logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
|
||||
|
||||
// read and key start at the same place, and either the this read and the key
|
||||
// share a mate location or the read is flagged as a duplicate
|
||||
if (readLoc.compareTo(keyLoc) == 0 ||
|
||||
read.getDuplicateReadFlag()) {
|
||||
if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) ||
|
||||
(readMateLoc == null && keyMateLoc == null)) {
|
||||
if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) {
|
||||
// at least one has to be marked as a duplicate
|
||||
final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
|
||||
if ( readMateLoc.compareTo(keyMateLoc) == 0 ) {
|
||||
// we are at the same position as the dup and have the same mat pos, it's a dup
|
||||
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
|
||||
dups.add(read);
|
||||
} else {
|
||||
uniques.add(read);
|
||||
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc));
|
||||
return reads;
|
||||
}
|
||||
} else {
|
||||
uniques.add(read);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uniques = reads;
|
||||
}
|
||||
|
||||
return new Pair<List<SAMRecord>, List<SAMRecord>>(uniques, dups);
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
*
|
||||
* @param sum of type T, the return from the walker
|
||||
* @param <M> the generic type
|
||||
* @param <T> the return type of the reduce function
|
||||
* @param dupWalker our duplicates walker
|
||||
* @param readIter our iterator
|
||||
*
|
||||
* @return the reduce type, T, the final product of all the reduce calls
|
||||
*/
|
||||
private <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
|
||||
Iterator<SAMRecord> readIter,
|
||||
T sum) {
|
||||
/**
|
||||
* while we still have more reads:
|
||||
* ok, here's the idea. We get all the reads that start at the same position in the genome
|
||||
* We then split the list of reads into sublists of reads:
|
||||
* -> those with the same mate pair position, for paired reads
|
||||
* -> those flagged as unpaired and duplicated but having the same start and end and
|
||||
*/
|
||||
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
|
||||
|
||||
|
||||
for (SAMRecord read : iter) {
|
||||
|
||||
// get the genome loc from the read
|
||||
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
|
||||
|
||||
List<SAMRecord> reads = readsAtLoc(read, iter);
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
|
||||
List<SAMRecord> uniqueReads = split.getFirst();
|
||||
List<SAMRecord> duplicateReads = split.getSecond();
|
||||
|
||||
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
|
||||
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
|
||||
if (reads.size() != uniqueReads.size() + duplicateReads.size())
|
||||
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
|
||||
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
|
||||
|
||||
// Jump forward in the reference to this locus location
|
||||
AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0));
|
||||
|
||||
// update the number of duplicate sets we've seen
|
||||
TraversalStatistics.nRecords++;
|
||||
|
||||
// we still have to fix the locus context provider to take care of this problem with > 1 length contexts
|
||||
// AlignmentContext locus = locusProvider.getLocusContext(site);
|
||||
|
||||
byte[] refBases = new byte[0];
|
||||
|
||||
if (dupWalker.mapUniqueReadsTooP()) {
|
||||
// Send each unique read to the map function
|
||||
for (SAMRecord unique : uniqueReads) {
|
||||
List<SAMRecord> l = Arrays.asList(unique);
|
||||
sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
|
||||
for (List<SAMRecord> reads : readSets) {
|
||||
SAMRecord key = reads.get(0);
|
||||
boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength();
|
||||
//System.out.printf("%s %s %b %b %d %d %d %d => %b%n",
|
||||
// read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(),
|
||||
// read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v);
|
||||
if ( v ) {
|
||||
//System.out.printf("Returning reads...%n");
|
||||
return reads;
|
||||
}
|
||||
}
|
||||
|
||||
if (duplicateReads.size() > 0)
|
||||
sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
|
||||
|
||||
printProgress(DUPS_STRING, site);
|
||||
|
||||
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
|
||||
logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -232,7 +162,7 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
|
||||
if (result) {
|
||||
TraversalStatistics.nSkippedReads++;
|
||||
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
|
||||
//System.out.printf(" [filter] %s => %b", rec.getReadName(), result);
|
||||
} else {
|
||||
TraversalStatistics.nReads++;
|
||||
}
|
||||
|
|
@ -240,27 +170,12 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
}
|
||||
}
|
||||
|
||||
public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads,
|
||||
GenomeLoc site,
|
||||
byte[] refBases,
|
||||
AlignmentContext locus,
|
||||
T sum) {
|
||||
final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads);
|
||||
if (keepMeP) {
|
||||
M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads);
|
||||
sum = dupWalker.reduce(x, sum);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// new style interface to the system
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
*
|
||||
|
|
@ -276,8 +191,7 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
Shard shard,
|
||||
ShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
|
||||
logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
|
||||
//logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", shard.getGenomeLoc()));
|
||||
|
||||
if (!(walker instanceof DuplicateWalker))
|
||||
throw new IllegalArgumentException("Walker isn't a duplicate walker!");
|
||||
|
|
@ -292,13 +206,46 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
|
||||
FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc());
|
||||
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
|
||||
return actuallyTraverse(dupWalker, iter, sum);
|
||||
}
|
||||
|
||||
/**
|
||||
* while we still have more reads:
|
||||
* ok, here's the idea. We get all the reads that start at the same position in the genome
|
||||
* We then split the list of reads into sublists of reads:
|
||||
* -> those with the same mate pair position, for paired reads
|
||||
* -> those flagged as unpaired and duplicated but having the same start and end
|
||||
*/
|
||||
for (SAMRecord read : iter) {
|
||||
// get the genome loc from the read
|
||||
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
|
||||
|
||||
Set<List<SAMRecord>> readSets = uniqueReadSets(readsAtLoc(read, iter));
|
||||
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size()));
|
||||
|
||||
// Jump forward in the reference to this locus location
|
||||
AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileup(site));
|
||||
|
||||
// update the number of duplicate sets we've seen
|
||||
TraversalStatistics.nRecords++;
|
||||
|
||||
final boolean keepMeP = dupWalker.filter(site, locus, readSets);
|
||||
if (keepMeP) {
|
||||
M x = dupWalker.map(site, locus, readSets);
|
||||
sum = dupWalker.reduce(x, sum);
|
||||
}
|
||||
|
||||
printProgress(DUPS_STRING, site);
|
||||
|
||||
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
|
||||
logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords)));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Temporary override of printOnTraversalDone.
|
||||
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
|
||||
*
|
||||
* @param sum Result of the computation.
|
||||
* @param <T> Type of the result.
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -17,9 +18,7 @@ import net.sf.samtools.SAMRecord;
|
|||
@Requires({DataSource.READS,DataSource.REFERENCE})
|
||||
public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(GenomeLoc loc, byte[] refBases, AlignmentContext context,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads) {
|
||||
public boolean filter(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
|
||||
return true; // We are keeping all the reads
|
||||
}
|
||||
|
||||
|
|
@ -31,9 +30,14 @@ public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapTyp
|
|||
*/
|
||||
public boolean mapUniqueReadsTooP() { return false; }
|
||||
|
||||
public abstract MapType map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads);
|
||||
/**
|
||||
* Called by the traversal engine to decide whether to call map() at loci without duplicate reads
|
||||
*
|
||||
* @return true if you want to see non duplicates during the traversal
|
||||
*/
|
||||
public boolean mapAtLociWithoutDuplicates() { return true; }
|
||||
|
||||
public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets );
|
||||
|
||||
// Given result of map function
|
||||
public abstract ReduceType reduceInit();
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
|
@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
|
|||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
class MismatchCounter {
|
||||
long nObs = 0;
|
||||
|
|
@ -160,41 +161,40 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
|
|||
}
|
||||
|
||||
// Print out data for regression
|
||||
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads) {
|
||||
public List<DuplicateComp> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
|
||||
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
|
||||
List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
|
||||
|
||||
if ( ! ACTUALLY_DO_WORK )
|
||||
return pairwiseComps;
|
||||
|
||||
if ( COMBINE_QUALS ) {
|
||||
Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
|
||||
if ( combinedReads != null ) {
|
||||
SAMRecord combined1 = combinedReads.first;
|
||||
SAMRecord combined2 = combinedReads.second;
|
||||
|
||||
if ( comparePairToSingleton )
|
||||
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
|
||||
else
|
||||
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
|
||||
}
|
||||
} else {
|
||||
int nComparisons = 0;
|
||||
for ( SAMRecord read1 : duplicateReads ) {
|
||||
for ( SAMRecord read2 : duplicateReads ) {
|
||||
if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
|
||||
// the hashcode insures we don't do A vs. B and B vs. A
|
||||
//System.out.printf("Comparing %s against %s%n", read1, read2);
|
||||
nComparisons++;
|
||||
pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
|
||||
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// todo -- fixme -- the logic here is all wrong given new interface
|
||||
// if ( ! ACTUALLY_DO_WORK )
|
||||
// return pairwiseComps;
|
||||
//
|
||||
// if ( COMBINE_QUALS ) {
|
||||
// Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
|
||||
// if ( combinedReads != null ) {
|
||||
// SAMRecord combined1 = combinedReads.first;
|
||||
// SAMRecord combined2 = combinedReads.second;
|
||||
//
|
||||
// if ( comparePairToSingleton )
|
||||
// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
|
||||
// else
|
||||
// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
|
||||
// }
|
||||
// } else {
|
||||
// int nComparisons = 0;
|
||||
// for ( SAMRecord read1 : duplicateReads ) {
|
||||
// for ( SAMRecord read2 : duplicateReads ) {
|
||||
// if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
|
||||
// // the hashcode insures we don't do A vs. B and B vs. A
|
||||
// //System.out.printf("Comparing %s against %s%n", read1, read2);
|
||||
// nComparisons++;
|
||||
// pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
|
||||
// if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
return pairwiseComps;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@ import org.broadinstitute.sting.utils.duplicates.DupUtils;
|
|||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
|
|
@ -15,7 +17,7 @@ import net.sf.samtools.SAMFileWriter;
|
|||
* Process the input bam file, optionally emitting all the unique reads found, and emitting the combined duplicate reads to
|
||||
* the specified output BAM location. If no output location is specified, the reads are written to STDOUT.
|
||||
*/
|
||||
public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileWriter> {
|
||||
public class CombineDuplicatesWalker extends DuplicateWalker<List<SAMRecord>, SAMFileWriter> {
|
||||
@Argument(fullName="outputBAM", shortName="outputBAM", required=false, doc="BAM File to write combined duplicates to")
|
||||
public SAMFileWriter outputBAM = null;
|
||||
|
||||
|
|
@ -45,50 +47,58 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
|
|||
/**
|
||||
* emit the read that was produced by combining the dupplicates
|
||||
*/
|
||||
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||
if ( output != null ) {
|
||||
output.addAlignment(read);
|
||||
} else {
|
||||
out.println(read.format());
|
||||
public SAMFileWriter reduce(List<SAMRecord> reads, SAMFileWriter output) {
|
||||
for ( SAMRecord read : reads ) {
|
||||
if ( output != null ) {
|
||||
output.addAlignment(read);
|
||||
} else {
|
||||
out.println(read.format());
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* We don't want to see loci without duplicates, since
|
||||
* @return
|
||||
*/
|
||||
public boolean mapAtLociWithoutDuplicates() { return false; }
|
||||
|
||||
/**
|
||||
* Build a combined read given the input list of non-unique reads. If there's just one read in the
|
||||
* set, it's considered unique and returned. If there's more than one, the N-way combine
|
||||
* duplicate function is invoked.
|
||||
*
|
||||
* @param loc the genome loc
|
||||
* @param refBases the reference bases for the given locus
|
||||
* @param context the alignment context that has the reads information
|
||||
* @param duplicateReads a list of the dupplicate reads at this locus
|
||||
* @param uniqueReads the unique read list at this locus
|
||||
* @param readSets the set of unique reads list at this locus
|
||||
* @return a read that combines the dupplicate reads at this locus
|
||||
*/
|
||||
public SAMRecord map(GenomeLoc loc, byte[] refBases, AlignmentContext context,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads) {
|
||||
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
|
||||
public List<SAMRecord> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
|
||||
List<SAMRecord> combinedReads = new ArrayList<SAMRecord>();
|
||||
|
||||
SAMRecord combinedRead = null;
|
||||
for ( List<SAMRecord> reads : readSets ) {
|
||||
SAMRecord combinedRead = null;
|
||||
|
||||
if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) {
|
||||
// we are a unique read
|
||||
combinedRead = duplicateReads.get(0);
|
||||
} else {
|
||||
// actually call the combine function
|
||||
//for (SAMRecord read : duplicateReads ) {
|
||||
// System.out.printf("Read %s%n", read.format());
|
||||
//}
|
||||
combinedRead = DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE);
|
||||
if ( reads.size() == 1 && ! reads.get(0).getDuplicateReadFlag() ) {
|
||||
// we are a unique read
|
||||
combinedRead = reads.get(0);
|
||||
} else {
|
||||
// actually call the combine function
|
||||
//for (SAMRecord read : duplicateReads ) {
|
||||
// System.out.printf("Read %s%n", read.format());
|
||||
//}
|
||||
combinedRead = DupUtils.combineDuplicates(reads, MAX_QUALITY_SCORE);
|
||||
}
|
||||
|
||||
if ( combinedRead.getDuplicateReadFlag() )
|
||||
throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
|
||||
combinedRead.getReadName(), reads.size(), combinedRead.format()));
|
||||
|
||||
combinedReads.add(combinedRead);
|
||||
}
|
||||
|
||||
if ( combinedRead.getDuplicateReadFlag() )
|
||||
throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s",
|
||||
combinedRead.getReadName(), duplicateReads.size(), combinedRead.format()));
|
||||
|
||||
return combinedRead;
|
||||
return combinedReads;
|
||||
}
|
||||
}
|
||||
|
|
@ -29,41 +29,61 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* a class to store the traversal information we pass around
|
||||
*/
|
||||
class DuplicateCount {
|
||||
public int count = 0; // the count of sites we were given
|
||||
public int undupDepth = 0; // the unique read count
|
||||
public int depth = 0; // the dupplicate read depth
|
||||
}
|
||||
public int count = 0; // the count of sites we were given
|
||||
public int nUniqueMolecules = 0; // the unique read count
|
||||
public int nDuplicatedMolecules = 0; // the unique read count
|
||||
public int depth = 0; // the dupplicate read depth
|
||||
}
|
||||
|
||||
/**
|
||||
* Count the number of unique reads, duplicates, and the average depth of unique reads and duplicates at all positions.
|
||||
* @author aaron
|
||||
*/
|
||||
public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, DuplicateCount> {
|
||||
@Argument(fullName="quiet", required=false, doc="If true, per locus information isn't printex")
|
||||
public boolean quiet = false;
|
||||
|
||||
/**
|
||||
* the map function, conforming to the duplicates interface
|
||||
* @param loc the genomic location
|
||||
* @param refBases the reference bases that cover this position, which we turn off
|
||||
* @param context the AlignmentContext, containing all the reads overlapping this region
|
||||
* @param uniqueReads all the unique reads, bundled together
|
||||
* @param duplicateReads all the duplicate reads
|
||||
* @param readSets all the duplicate reads
|
||||
* @return a DuplicateCount object, with the appropriate stats
|
||||
*/
|
||||
public DuplicateCount map(GenomeLoc loc, byte[] refBases, AlignmentContext context, List<SAMRecord> uniqueReads, List<SAMRecord> duplicateReads) {
|
||||
public DuplicateCount map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
|
||||
if ( ! quiet ) out.printf("%s with %d read sets => ", loc, readSets.size());
|
||||
|
||||
DuplicateCount dup = new DuplicateCount();
|
||||
dup.depth = 0;
|
||||
for ( List<SAMRecord> reads : readSets) {
|
||||
List<String> names = new ArrayList<String>();
|
||||
for ( SAMRecord read : reads ) {
|
||||
names.add(read.getReadName());
|
||||
}
|
||||
if ( ! quiet ) out.printf("%d reads [%s] ", reads.size(), Utils.join(",", names));
|
||||
dup.depth += reads.size();
|
||||
dup.nDuplicatedMolecules += reads.size() > 1 ? 1 : 0; // if there's more than 1 read per set, we're a duplicated reads
|
||||
}
|
||||
if ( ! quiet ) out.printf("%n");
|
||||
|
||||
dup.count = 1;
|
||||
dup.undupDepth = uniqueReads.size();
|
||||
dup.depth = duplicateReads.size();
|
||||
dup.nUniqueMolecules = readSets.size();
|
||||
return dup;
|
||||
}
|
||||
|
||||
public boolean mapAtLociWithoutDuplicates() { return true; }
|
||||
|
||||
/**
|
||||
* setup our walker. In this case, new a DuplicateCount object and return it
|
||||
* @return the object holding the counts of the duplicates
|
||||
|
|
@ -82,7 +102,8 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
|
|||
DuplicateCount dup = new DuplicateCount();
|
||||
dup.count = sum.count + value.count;
|
||||
dup.depth = value.depth + sum.depth;
|
||||
dup.undupDepth = value.undupDepth + sum.undupDepth;
|
||||
dup.nDuplicatedMolecules = value.nDuplicatedMolecules + sum.nDuplicatedMolecules;
|
||||
dup.nUniqueMolecules = value.nUniqueMolecules + sum.nUniqueMolecules;
|
||||
return dup;
|
||||
}
|
||||
|
||||
|
|
@ -93,9 +114,10 @@ public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, Dupli
|
|||
public void onTraversalDone(DuplicateCount result) {
|
||||
out.println("[REDUCE RESULT] Traversal result is: ");
|
||||
out.println("traversal iterations = " + result.count);
|
||||
out.println("average depth = " + (double)result.depth / (double)result.count);
|
||||
out.println("duplicates seen = " + result.depth);
|
||||
out.println("unique read count = " + result.undupDepth);
|
||||
out.println("average unique read depth = " + (double)result.undupDepth / (double)result.count);
|
||||
out.printf("average depth = %.2f%n", (double)result.depth / (double)result.count);
|
||||
out.println("unique molecules seen = " + result.nUniqueMolecules);
|
||||
out.println("duplicated molecules seen = " + result.nDuplicatedMolecules);
|
||||
out.printf("percent duplicated = %.2f%%%n", result.nDuplicatedMolecules / (double)result.nUniqueMolecules * 100);
|
||||
out.printf("average unique read depth = %.2f%n", (double)result.nUniqueMolecules / (double)result.count);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,6 +39,13 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
|
|||
this(loc, readsOffsets2Pileup(reads, offsets));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new version of a read backed pileup at loc without any aligned reads
|
||||
*
|
||||
*/
|
||||
public ReadBackedPileup(GenomeLoc loc ) {
|
||||
this(loc, new ArrayList<PileupElement>(0));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new version of a read backed pileup at loc, using the reads and their corresponding
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import org.junit.Test;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -37,49 +38,48 @@ public class TraverseDuplicatesTest extends BaseTest {
|
|||
public void testAllDupplicatesNoPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
|
||||
read.setDuplicateReadFlag(true);
|
||||
list.add(read);
|
||||
}
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(0, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(10, myPairing.second.size()); // dup's
|
||||
Set<List<SAMRecord>> myPairings = obj.uniqueReadSets(list);
|
||||
Assert.assertEquals(1, myPairings.size());
|
||||
Assert.assertEquals(10, myPairings.iterator().next().size()); // dup's
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoDupplicatesNoPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
|
||||
read.setDuplicateReadFlag(false);
|
||||
list.add(read);
|
||||
}
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(10, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(0, myPairing.second.size()); // dup's
|
||||
|
||||
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
|
||||
Assert.assertEquals(10, myPairing.size()); // unique
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFiftyFiftyNoPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 5; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
|
||||
read.setDuplicateReadFlag(true);
|
||||
list.add(read);
|
||||
}
|
||||
for (int x = 10; x < 15; x++)
|
||||
list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100));
|
||||
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(5, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(5, myPairing.second.size()); // dup's
|
||||
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
|
||||
Assert.assertEquals(6, myPairing.size()); // unique
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllDupplicatesAllPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
|
||||
read.setDuplicateReadFlag(true);
|
||||
read.setMateAlignmentStart(100);
|
||||
read.setMateReferenceIndex(0);
|
||||
|
|
@ -87,16 +87,15 @@ public class TraverseDuplicatesTest extends BaseTest {
|
|||
list.add(read);
|
||||
}
|
||||
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(0, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(10, myPairing.second.size()); // dup's
|
||||
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
|
||||
Assert.assertEquals(1, myPairing.size()); // unique
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoDupplicatesAllPairs() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100);
|
||||
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line)
|
||||
read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's
|
||||
read.setMateReferenceIndex(0);
|
||||
|
|
@ -104,16 +103,15 @@ public class TraverseDuplicatesTest extends BaseTest {
|
|||
list.add(read);
|
||||
}
|
||||
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(0, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(10, myPairing.second.size()); // dup's
|
||||
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
|
||||
Assert.assertEquals(1, myPairing.size()); // unique
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllDupplicatesAllPairsDifferentPairedEnd() {
|
||||
List<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
for (int x = 0; x < 10; x++) {
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100);
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100);
|
||||
if (x == 0) read.setDuplicateReadFlag(true); // one is a dup
|
||||
read.setMateAlignmentStart(100 + x);
|
||||
read.setMateReferenceIndex(0);
|
||||
|
|
@ -121,8 +119,7 @@ public class TraverseDuplicatesTest extends BaseTest {
|
|||
list.add(read);
|
||||
}
|
||||
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> myPairing = obj.splitDuplicates(list);
|
||||
Assert.assertEquals(9, myPairing.first.size()); // unique
|
||||
Assert.assertEquals(1, myPairing.second.size()); // dup's
|
||||
Set<List<SAMRecord>> myPairing = obj.uniqueReadSets(list);
|
||||
Assert.assertEquals(10, myPairing.size()); // unique
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -249,7 +249,7 @@ def calculateBinsForValues(values, field, minValue, maxValue, partitions):
|
|||
bins[-1][1] = bin[0]
|
||||
bins[-1][2] += curSize
|
||||
|
||||
#print 'Returning ', bins
|
||||
print 'Returning ', bins
|
||||
#sys.exit(1)
|
||||
return bins
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue