From fcc80e8632f5934a52e9d855fe3efaab9c4e620f Mon Sep 17 00:00:00 2001 From: depristo Date: Mon, 28 Dec 2009 23:56:49 +0000 Subject: [PATCH] Completely rewritten duplicate traversal, more free of bugs, with integration tests for count duplicates walker validated on a TCGA hybrid capture lane. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2458 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/traversals/TraverseDuplicates.java | 213 +++++++----------- .../sting/gatk/walkers/DuplicateWalker.java | 16 +- .../walkers/DuplicateQualsWalker.java | 66 +++--- .../duplicates/CombineDuplicatesWalker.java | 66 +++--- .../duplicates/CountDuplicatesWalker.java | 52 +++-- .../sting/utils/pileup/ReadBackedPileup.java | 7 + .../traversals/TraverseDuplicatesTest.java | 45 ++-- python/snpSelector.py | 2 +- 8 files changed, 227 insertions(+), 240 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 10ea57b40..050c51e6a 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -32,6 +32,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.providers.ReadView; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ManagingReferenceOrderedView; import org.broadinstitute.sting.gatk.datasources.shards.ReadShard; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.iterators.PushbackIterator; @@ -40,11 +41,10 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; +import java.util.*; /** * @author Mark DePristo @@ -85,128 +85,58 @@ public class TraverseDuplicates extends TraversalEngine { return l; } - protected Pair, List> splitDuplicates(List reads) { - List uniques = new ArrayList(); - List dups = new ArrayList(); - - // find the first duplicate - SAMRecord key = null; - for (SAMRecord read : reads) { - if (read.getDuplicateReadFlag()) { - // this is our key - key = read; - if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName())); - break; + protected Set> uniqueReadSets(List reads) { + Set> readSets = new HashSet>(); + for ( SAMRecord read : reads ) { + + List readSet = findDuplicateReads(read, readSets); + if ( readSet == null ) { + readSets.add(new ArrayList(Arrays.asList(read))); // copy so I can add to the list + } else { + readSet.add(read); } } - // At this point, there are two possibilities, we have found at least one dup or not - // if it's a dup, add it to the dups list, otherwise add it to the uniques list - if (key != null) { - final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key); - final GenomeLoc keyMateLoc = (!key.getReadPairedFlag()) ? null : - GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); - for (SAMRecord read : reads) { - final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read); - final GenomeLoc readMateLoc = (!key.getReadPairedFlag()) ? null : - GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); - if (DEBUG) - logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc)); + return readSets; + } + + protected List findDuplicateReads(SAMRecord read, Set> readSets ) { + if ( read.getReadPairedFlag() ) { + // paired + final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); + + for (List reads : readSets) { + SAMRecord key = reads.get(0); + //if (DEBUG) + // logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc)); // read and key start at the same place, and either the this read and the key // share a mate location or the read is flagged as a duplicate - if (readLoc.compareTo(keyLoc) == 0 || - read.getDuplicateReadFlag()) { - if ((readMateLoc != null && keyMateLoc != null && readMateLoc.compareTo(keyMateLoc) == 0) || - (readMateLoc == null && keyMateLoc == null)) { + if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) { + // at least one has to be marked as a duplicate + final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); + if ( readMateLoc.compareTo(keyMateLoc) == 0 ) { // we are at the same position as the dup and have the same mat pos, it's a dup - if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read)); - dups.add(read); - } else { - uniques.add(read); + if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc)); + return reads; } - } else { - uniques.add(read); } } } else { - uniques = reads; - } - - return new Pair, List>(uniques, dups); - } - - /** - * Traverse by reads, given the data and the walker - * - * @param sum of type T, the return from the walker - * @param the generic type - * @param the return type of the reduce function - * @param dupWalker our duplicates walker - * @param readIter our iterator - * - * @return the reduce type, T, the final product of all the reduce calls - */ - private T actuallyTraverse(DuplicateWalker dupWalker, - Iterator readIter, - T sum) { - /** - * while we still have more reads: - * ok, here's the idea. We get all the reads that start at the same position in the genome - * We then split the list of reads into sublists of reads: - * -> those with the same mate pair position, for paired reads - * -> those flagged as unpaired and duplicated but having the same start and end and - */ - PushbackIterator iter = new PushbackIterator(readIter); - - - for (SAMRecord read : iter) { - - // get the genome loc from the read - GenomeLoc site = GenomeLocParser.createGenomeLoc(read); - - List reads = readsAtLoc(read, iter); - Pair, List> split = splitDuplicates(reads); - List uniqueReads = split.getFirst(); - List duplicateReads = split.getSecond(); - - logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads", - site, reads.size(), uniqueReads.size(), duplicateReads.size())); - if (reads.size() != uniqueReads.size() + duplicateReads.size()) - throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match", - reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size())); - - // Jump forward in the reference to this locus location - AlignmentContext locus = new AlignmentContext(site, duplicateReads, Arrays.asList(0)); - - // update the number of duplicate sets we've seen - TraversalStatistics.nRecords++; - - // we still have to fix the locus context provider to take care of this problem with > 1 length contexts - // AlignmentContext locus = locusProvider.getLocusContext(site); - - byte[] refBases = new byte[0]; - - if (dupWalker.mapUniqueReadsTooP()) { - // Send each unique read to the map function - for (SAMRecord unique : uniqueReads) { - List l = Arrays.asList(unique); - sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum); + for (List reads : readSets) { + SAMRecord key = reads.get(0); + boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength(); + //System.out.printf("%s %s %b %b %d %d %d %d => %b%n", + // read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(), + // read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v); + if ( v ) { + //System.out.printf("Returning reads...%n"); + return reads; } } - - if (duplicateReads.size() > 0) - sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum); - - printProgress(DUPS_STRING, site); - - if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) { - logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords))); - break; - } } - return sum; + return null; } /** @@ -232,7 +162,7 @@ public class TraverseDuplicates extends TraversalEngine { if (result) { TraversalStatistics.nSkippedReads++; - //System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why); + //System.out.printf(" [filter] %s => %b", rec.getReadName(), result); } else { TraversalStatistics.nReads++; } @@ -240,27 +170,12 @@ public class TraverseDuplicates extends TraversalEngine { } } - public T mapOne(DuplicateWalker dupWalker, - List uniqueReads, - List duplicateReads, - GenomeLoc site, - byte[] refBases, - AlignmentContext locus, - T sum) { - final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads); - if (keepMeP) { - M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads); - sum = dupWalker.reduce(x, sum); - } - return sum; - } - - // -------------------------------------------------------------------------------------------------------------- // // new style interface to the system // // -------------------------------------------------------------------------------------------------------------- + /** * Traverse by reads, given the data and the walker * @@ -276,8 +191,7 @@ public class TraverseDuplicates extends TraversalEngine { Shard shard, ShardDataProvider dataProvider, T sum) { - - logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard) shard).getSize())); + //logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", shard.getGenomeLoc())); if (!(walker instanceof DuplicateWalker)) throw new IllegalArgumentException("Walker isn't a duplicate walker!"); @@ -292,13 +206,46 @@ public class TraverseDuplicates extends TraversalEngine { FilteringIterator filterIter = new FilteringIterator(new ReadView(dataProvider).iterator(), new duplicateStreamFilterFunc()); PushbackIterator iter = new PushbackIterator(filterIter); - return actuallyTraverse(dupWalker, iter, sum); - } + /** + * while we still have more reads: + * ok, here's the idea. We get all the reads that start at the same position in the genome + * We then split the list of reads into sublists of reads: + * -> those with the same mate pair position, for paired reads + * -> those flagged as unpaired and duplicated but having the same start and end + */ + for (SAMRecord read : iter) { + // get the genome loc from the read + GenomeLoc site = GenomeLocParser.createGenomeLoc(read); + + Set> readSets = uniqueReadSets(readsAtLoc(read, iter)); + logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size())); + + // Jump forward in the reference to this locus location + AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileup(site)); + + // update the number of duplicate sets we've seen + TraversalStatistics.nRecords++; + + final boolean keepMeP = dupWalker.filter(site, locus, readSets); + if (keepMeP) { + M x = dupWalker.map(site, locus, readSets); + sum = dupWalker.reduce(x, sum); + } + + printProgress(DUPS_STRING, site); + + if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) { + logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords))); + break; + } + } + + return sum; + } /** * Temporary override of printOnTraversalDone. - * TODO: Add some sort of TE.getName() function once all TraversalEngines are ported. * * @param sum Result of the computation. * @param Type of the result. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java index 09edc466b..c3b0f62b9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.List; +import java.util.Set; import net.sf.samtools.SAMRecord; @@ -17,9 +18,7 @@ import net.sf.samtools.SAMRecord; @Requires({DataSource.READS,DataSource.REFERENCE}) public abstract class DuplicateWalker extends Walker { // Do we actually want to operate on the context? - public boolean filter(GenomeLoc loc, byte[] refBases, AlignmentContext context, - List uniqueReads, - List duplicateReads) { + public boolean filter(GenomeLoc loc, AlignmentContext context, Set> readSets ) { return true; // We are keeping all the reads } @@ -31,9 +30,14 @@ public abstract class DuplicateWalker extends Walker uniqueReads, - List duplicateReads); + /** + * Called by the traversal engine to decide whether to call map() at loci without duplicate reads + * + * @return true if you want to see non duplicates during the traversal + */ + public boolean mapAtLociWithoutDuplicates() { return true; } + + public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set> readSets ); // Given result of map function public abstract ReduceType reduceInit(); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DuplicateQualsWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DuplicateQualsWalker.java index 628af8f24..f83e4e62e 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DuplicateQualsWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DuplicateQualsWalker.java @@ -23,7 +23,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.duplicates; +package org.broadinstitute.sting.oneoffprojects.walkers; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.duplicates.DuplicateComp; import java.io.PrintStream; import java.util.ArrayList; import java.util.List; +import java.util.Set; class MismatchCounter { long nObs = 0; @@ -160,41 +161,40 @@ public class DuplicateQualsWalker extends DuplicateWalker, Q } // Print out data for regression - public List map(GenomeLoc loc, byte[] refBases, AlignmentContext context, - List uniqueReads, - List duplicateReads) { + public List map(GenomeLoc loc, AlignmentContext context, Set> readSets ) { //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size())); List pairwiseComps = new ArrayList(); - - if ( ! ACTUALLY_DO_WORK ) - return pairwiseComps; - if ( COMBINE_QUALS ) { - Pair combinedReads = DupUtils.combinedReadPair( duplicateReads ); - if ( combinedReads != null ) { - SAMRecord combined1 = combinedReads.first; - SAMRecord combined2 = combinedReads.second; - - if ( comparePairToSingleton ) - pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads ); - else - pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads ); - } - } else { - int nComparisons = 0; - for ( SAMRecord read1 : duplicateReads ) { - for ( SAMRecord read2 : duplicateReads ) { - if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) { - // the hashcode insures we don't do A vs. B and B vs. A - //System.out.printf("Comparing %s against %s%n", read1, read2); - nComparisons++; - pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads ); - if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET ) - break; - } - } - } - } + // todo -- fixme -- the logic here is all wrong given new interface +// if ( ! ACTUALLY_DO_WORK ) +// return pairwiseComps; +// +// if ( COMBINE_QUALS ) { +// Pair combinedReads = DupUtils.combinedReadPair( duplicateReads ); +// if ( combinedReads != null ) { +// SAMRecord combined1 = combinedReads.first; +// SAMRecord combined2 = combinedReads.second; +// +// if ( comparePairToSingleton ) +// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads ); +// else +// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads ); +// } +// } else { +// int nComparisons = 0; +// for ( SAMRecord read1 : duplicateReads ) { +// for ( SAMRecord read2 : duplicateReads ) { +// if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) { +// // the hashcode insures we don't do A vs. B and B vs. A +// //System.out.printf("Comparing %s against %s%n", read1, read2); +// nComparisons++; +// pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads ); +// if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET ) +// break; +// } +// } +// } +// } return pairwiseComps; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java index 98470d27e..c87a8a337 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java @@ -7,6 +7,8 @@ import org.broadinstitute.sting.utils.duplicates.DupUtils; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.List; +import java.util.Set; +import java.util.ArrayList; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileWriter; @@ -15,7 +17,7 @@ import net.sf.samtools.SAMFileWriter; * Process the input bam file, optionally emitting all the unique reads found, and emitting the combined duplicate reads to * the specified output BAM location. If no output location is specified, the reads are written to STDOUT. */ -public class CombineDuplicatesWalker extends DuplicateWalker { +public class CombineDuplicatesWalker extends DuplicateWalker, SAMFileWriter> { @Argument(fullName="outputBAM", shortName="outputBAM", required=false, doc="BAM File to write combined duplicates to") public SAMFileWriter outputBAM = null; @@ -45,50 +47,58 @@ public class CombineDuplicatesWalker extends DuplicateWalker reads, SAMFileWriter output) { + for ( SAMRecord read : reads ) { + if ( output != null ) { + output.addAlignment(read); + } else { + out.println(read.format()); + } } return output; } + /** + * We don't want to see loci without duplicates, since + * @return + */ + public boolean mapAtLociWithoutDuplicates() { return false; } + /** * Build a combined read given the input list of non-unique reads. If there's just one read in the * set, it's considered unique and returned. If there's more than one, the N-way combine * duplicate function is invoked. * * @param loc the genome loc - * @param refBases the reference bases for the given locus * @param context the alignment context that has the reads information - * @param duplicateReads a list of the dupplicate reads at this locus - * @param uniqueReads the unique read list at this locus + * @param readSets the set of unique reads list at this locus * @return a read that combines the dupplicate reads at this locus */ - public SAMRecord map(GenomeLoc loc, byte[] refBases, AlignmentContext context, - List uniqueReads, - List duplicateReads) { - //logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size())); + public List map(GenomeLoc loc, AlignmentContext context, Set> readSets ) { + List combinedReads = new ArrayList(); - SAMRecord combinedRead = null; + for ( List reads : readSets ) { + SAMRecord combinedRead = null; - if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) { - // we are a unique read - combinedRead = duplicateReads.get(0); - } else { - // actually call the combine function - //for (SAMRecord read : duplicateReads ) { - // System.out.printf("Read %s%n", read.format()); - //} - combinedRead = DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE); + if ( reads.size() == 1 && ! reads.get(0).getDuplicateReadFlag() ) { + // we are a unique read + combinedRead = reads.get(0); + } else { + // actually call the combine function + //for (SAMRecord read : duplicateReads ) { + // System.out.printf("Read %s%n", read.format()); + //} + combinedRead = DupUtils.combineDuplicates(reads, MAX_QUALITY_SCORE); + } + + if ( combinedRead.getDuplicateReadFlag() ) + throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s", + combinedRead.getReadName(), reads.size(), combinedRead.format())); + + combinedReads.add(combinedRead); } - if ( combinedRead.getDuplicateReadFlag() ) - throw new RuntimeException(String.format("Combined read %s [of %d] is a duplicate after combination -- this is a bug%n%s", - combinedRead.getReadName(), duplicateReads.size(), combinedRead.format())); - - return combinedRead; + return combinedReads; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java index e5c8aff9f..46bfc6963 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java @@ -29,41 +29,61 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.List; +import java.util.Set; +import java.util.ArrayList; /** * a class to store the traversal information we pass around */ class DuplicateCount { - public int count = 0; // the count of sites we were given - public int undupDepth = 0; // the unique read count - public int depth = 0; // the dupplicate read depth - } + public int count = 0; // the count of sites we were given + public int nUniqueMolecules = 0; // the unique read count + public int nDuplicatedMolecules = 0; // the unique read count + public int depth = 0; // the dupplicate read depth +} /** * Count the number of unique reads, duplicates, and the average depth of unique reads and duplicates at all positions. * @author aaron */ public class CountDuplicatesWalker extends DuplicateWalker { + @Argument(fullName="quiet", required=false, doc="If true, per locus information isn't printex") + public boolean quiet = false; /** * the map function, conforming to the duplicates interface * @param loc the genomic location - * @param refBases the reference bases that cover this position, which we turn off * @param context the AlignmentContext, containing all the reads overlapping this region - * @param uniqueReads all the unique reads, bundled together - * @param duplicateReads all the duplicate reads + * @param readSets all the duplicate reads * @return a DuplicateCount object, with the appropriate stats */ - public DuplicateCount map(GenomeLoc loc, byte[] refBases, AlignmentContext context, List uniqueReads, List duplicateReads) { + public DuplicateCount map(GenomeLoc loc, AlignmentContext context, Set> readSets ) { + if ( ! quiet ) out.printf("%s with %d read sets => ", loc, readSets.size()); + DuplicateCount dup = new DuplicateCount(); + dup.depth = 0; + for ( List reads : readSets) { + List names = new ArrayList(); + for ( SAMRecord read : reads ) { + names.add(read.getReadName()); + } + if ( ! quiet ) out.printf("%d reads [%s] ", reads.size(), Utils.join(",", names)); + dup.depth += reads.size(); + dup.nDuplicatedMolecules += reads.size() > 1 ? 1 : 0; // if there's more than 1 read per set, we're a duplicated reads + } + if ( ! quiet ) out.printf("%n"); + dup.count = 1; - dup.undupDepth = uniqueReads.size(); - dup.depth = duplicateReads.size(); + dup.nUniqueMolecules = readSets.size(); return dup; } + public boolean mapAtLociWithoutDuplicates() { return true; } + /** * setup our walker. In this case, new a DuplicateCount object and return it * @return the object holding the counts of the duplicates @@ -82,7 +102,8 @@ public class CountDuplicatesWalker extends DuplicateWalker { this(loc, readsOffsets2Pileup(reads, offsets)); } + /** + * Create a new version of a read backed pileup at loc without any aligned reads + * + */ + public ReadBackedPileup(GenomeLoc loc ) { + this(loc, new ArrayList(0)); + } /** * Create a new version of a read backed pileup at loc, using the reads and their corresponding diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java index 027002410..7b834ab74 100644 --- a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java +++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesTest.java @@ -12,6 +12,7 @@ import org.junit.Test; import java.util.ArrayList; import java.util.List; +import java.util.Set; /** @@ -37,49 +38,48 @@ public class TraverseDuplicatesTest extends BaseTest { public void testAllDupplicatesNoPairs() { List list = new ArrayList(); for (int x = 0; x < 10; x++) { - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100); read.setDuplicateReadFlag(true); list.add(read); } - Pair, List> myPairing = obj.splitDuplicates(list); - Assert.assertEquals(0, myPairing.first.size()); // unique - Assert.assertEquals(10, myPairing.second.size()); // dup's + Set> myPairings = obj.uniqueReadSets(list); + Assert.assertEquals(1, myPairings.size()); + Assert.assertEquals(10, myPairings.iterator().next().size()); // dup's } @Test public void testNoDupplicatesNoPairs() { List list = new ArrayList(); for (int x = 0; x < 10; x++) { - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100); read.setDuplicateReadFlag(false); list.add(read); } - Pair, List> myPairing = obj.splitDuplicates(list); - Assert.assertEquals(10, myPairing.first.size()); // unique - Assert.assertEquals(0, myPairing.second.size()); // dup's + + Set> myPairing = obj.uniqueReadSets(list); + Assert.assertEquals(10, myPairing.size()); // unique } @Test public void testFiftyFiftyNoPairs() { List list = new ArrayList(); for (int x = 0; x < 5; x++) { - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100); read.setDuplicateReadFlag(true); list.add(read); } for (int x = 10; x < 15; x++) list.add(ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 0, x, 100)); - Pair, List> myPairing = obj.splitDuplicates(list); - Assert.assertEquals(5, myPairing.first.size()); // unique - Assert.assertEquals(5, myPairing.second.size()); // dup's + Set> myPairing = obj.uniqueReadSets(list); + Assert.assertEquals(6, myPairing.size()); // unique } @Test public void testAllDupplicatesAllPairs() { List list = new ArrayList(); for (int x = 0; x < 10; x++) { - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100); read.setDuplicateReadFlag(true); read.setMateAlignmentStart(100); read.setMateReferenceIndex(0); @@ -87,16 +87,15 @@ public class TraverseDuplicatesTest extends BaseTest { list.add(read); } - Pair, List> myPairing = obj.splitDuplicates(list); - Assert.assertEquals(0, myPairing.first.size()); // unique - Assert.assertEquals(10, myPairing.second.size()); // dup's + Set> myPairing = obj.uniqueReadSets(list); + Assert.assertEquals(1, myPairing.size()); // unique } @Test public void testNoDupplicatesAllPairs() { List list = new ArrayList(); for (int x = 0; x < 10; x++) { - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ"+ x, 0, 1, 100); if (x == 0) read.setDuplicateReadFlag(true); // one is a dup but (next line) read.setMateAlignmentStart(100); // they all have a shared start and mate start so they're dup's read.setMateReferenceIndex(0); @@ -104,16 +103,15 @@ public class TraverseDuplicatesTest extends BaseTest { list.add(read); } - Pair, List> myPairing = obj.splitDuplicates(list); - Assert.assertEquals(0, myPairing.first.size()); // unique - Assert.assertEquals(10, myPairing.second.size()); // dup's + Set> myPairing = obj.uniqueReadSets(list); + Assert.assertEquals(1, myPairing.size()); // unique } @Test public void testAllDupplicatesAllPairsDifferentPairedEnd() { List list = new ArrayList(); for (int x = 0; x < 10; x++) { - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ", 0, 1, 100); + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "SWEET_READ" + x, 0, 1, 100); if (x == 0) read.setDuplicateReadFlag(true); // one is a dup read.setMateAlignmentStart(100 + x); read.setMateReferenceIndex(0); @@ -121,8 +119,7 @@ public class TraverseDuplicatesTest extends BaseTest { list.add(read); } - Pair, List> myPairing = obj.splitDuplicates(list); - Assert.assertEquals(9, myPairing.first.size()); // unique - Assert.assertEquals(1, myPairing.second.size()); // dup's + Set> myPairing = obj.uniqueReadSets(list); + Assert.assertEquals(10, myPairing.size()); // unique } } diff --git a/python/snpSelector.py b/python/snpSelector.py index 6c8ebc7a8..f5599b8f4 100755 --- a/python/snpSelector.py +++ b/python/snpSelector.py @@ -249,7 +249,7 @@ def calculateBinsForValues(values, field, minValue, maxValue, partitions): bins[-1][1] = bin[0] bins[-1][2] += curSize - #print 'Returning ', bins + print 'Returning ', bins #sys.exit(1) return bins