diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 7d3a001e3..ac40b1236 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk; import net.sf.picard.reference.ReferenceSequenceFile; @@ -91,7 +116,7 @@ public class GenomeAnalysisEngine { // if we're a read or a locus walker, we use the new system. Right now we have complicated // branching based on the input data, but this should disapear when all the traversals are switched over - if (my_walker instanceof LocusWalker || my_walker instanceof ReadWalker) { + if (my_walker instanceof LocusWalker || my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) { microScheduler = createMicroscheduler(my_walker, rods); } else { // we have an old style traversal, once we're done return legacyTraversal(my_walker, rods); @@ -129,15 +154,12 @@ public class GenomeAnalysisEngine { * into their own function allows us to deviate in the two behaviors so the new style traversals aren't limited to what * the old style does. As traversals are converted, this function should disappear. * - * @param my_walker - * @param rods + * @param my_walker the walker to run + * @param rods the rod tracks to associate with */ private void legacyTraversal(Walker my_walker, List> rods) { if (my_walker instanceof LocusWindowWalker) { this.engine = new TraverseByLocusWindows(argCollection.samFiles, argCollection.referenceFile, rods); - } else if (my_walker instanceof DuplicateWalker) { - // we're a duplicate walker - this.engine = new TraverseDuplicates(argCollection.samFiles, argCollection.referenceFile, rods); } else { throw new RuntimeException("Unexpected walker type: " + my_walker); } @@ -178,11 +200,13 @@ public class GenomeAnalysisEngine { microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads); engine = microScheduler.getTraversalEngine(); } - else if (my_walker instanceof ReadWalker) { + else if (my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) { if (argCollection.referenceFile == null) - Utils.scareUser(String.format("Locus-based traversals require a reference file but none was given")); + Utils.scareUser(String.format("Read-based traversals require a reference file but none was given")); microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads); engine = microScheduler.getTraversalEngine(); + } else { + Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName)); } return microScheduler; diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index c64ed721b..52be738ae 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -1,32 +1,56 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.executive; import net.sf.picard.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; -import org.broadinstitute.sting.gatk.datasources.shards.Shard; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; -import org.broadinstitute.sting.gatk.traversals.TraversalEngine; -import org.broadinstitute.sting.gatk.traversals.TraverseReads; -import org.broadinstitute.sting.gatk.traversals.TraverseLoci; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.Reads; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.traversals.TraversalEngine; +import org.broadinstitute.sting.gatk.traversals.TraverseDuplicates; +import org.broadinstitute.sting.gatk.traversals.TraverseLoci; +import org.broadinstitute.sting.gatk.traversals.TraverseReads; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; -import java.util.List; import java.util.ArrayList; +import java.util.List; + /** * Created by IntelliJ IDEA. @@ -51,8 +75,13 @@ public abstract class MicroScheduler { /** * MicroScheduler factory function. Create a microscheduler appropriate for reducing the * selected walker. - * @param walker Which walker to use. + * + * @param walker Which walker to use. + * @param reads the informations associated with the reads + * @param ref the reference file + * @param rods the rods to include in the traversal * @param nThreadsToUse Number of threads to utilize. + * * @return The best-fit microscheduler. */ public static MicroScheduler create(Walker walker, Reads reads, File ref, List> rods, int nThreadsToUse) { @@ -67,16 +96,24 @@ public abstract class MicroScheduler { /** * Create a microscheduler given the reads and reference. - * @param reads The reads. + * + * @param walker the walker to execute with + * @param reads The reads. * @param refFile File pointer to the reference. + * @param rods the rods to include in the traversal */ protected MicroScheduler(Walker walker, Reads reads, File refFile, List> rods) { if (walker instanceof ReadWalker) { traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods); - this.reads = getReadsDataSource(reads,true); - } else { + this.reads = getReadsDataSource(reads, true); + } else if (walker instanceof LocusWalker) { traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods); - this.reads = getReadsDataSource(reads,false); + this.reads = getReadsDataSource(reads, false); + } else if (walker instanceof DuplicateWalker) { + traversalEngine = new TraverseDuplicates(reads.getReadsFiles(), refFile, rods); + this.reads = getReadsDataSource(reads, true); + } else { + throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } @@ -87,6 +124,7 @@ public abstract class MicroScheduler { /** * A temporary getter for the traversal engine. In the future, clients * of the microscheduler shouldn't need to know anything about the traversal engine. + * * @return The traversal engine. */ public TraversalEngine getTraversalEngine() { @@ -95,18 +133,23 @@ public abstract class MicroScheduler { /** * Walks a walker over the given list of intervals. - * @param walker Computation to perform over dataset. - * @param intervals A list of intervals over which to walk. Null for whole dataset. + * + * @param walker Computation to perform over dataset. + * @param intervals A list of intervals over which to walk. Null for whole dataset. * @param maxIterations the maximum number of iterations we're to perform + * * @return the return type of the walker */ public abstract Object execute(Walker walker, GenomeLocSortedSet intervals, Integer maxIterations); /** * Get the sharding strategy given a driving data source. - * @param walker Walker for which to infer sharding strategy. + * + * @param walker Walker for which to infer sharding strategy. * @param drivingDataSource Data on which to shard. - * @param intervals Intervals to use when limiting sharding. + * @param intervals Intervals to use when limiting sharding. + * @param maxIterations the maximum number of iterations to run through + * * @return Sharding strategy for this driving data source. */ protected ShardStrategy getShardStrategy(Walker walker, @@ -118,31 +161,32 @@ public abstract class MicroScheduler { if (walker instanceof LocusWalker) { if (intervals != null) { shardType = (walker.isReduceByInterval()) ? - ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL : - ShardStrategyFactory.SHATTER_STRATEGY.LINEAR; + ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL : + ShardStrategyFactory.SHATTER_STRATEGY.LINEAR; shardStrategy = ShardStrategyFactory.shatter(shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - intervals, maxIterations); + drivingDataSource.getSequenceDictionary(), + SHARD_SIZE, + intervals, maxIterations); } else shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, maxIterations); + drivingDataSource.getSequenceDictionary(), + SHARD_SIZE, maxIterations); - } else if (walker instanceof ReadWalker) { + } else if (walker instanceof ReadWalker || + walker instanceof DuplicateWalker) { shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS; if (intervals != null) { shardStrategy = ShardStrategyFactory.shatter(shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - intervals, maxIterations); + drivingDataSource.getSequenceDictionary(), + SHARD_SIZE, + intervals, maxIterations); } else { shardStrategy = ShardStrategyFactory.shatter(shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, maxIterations); + drivingDataSource.getSequenceDictionary(), + SHARD_SIZE, maxIterations); } } else throw new StingException("Unable to support walker of type" + walker.getClass().getName()); @@ -152,7 +196,9 @@ public abstract class MicroScheduler { /** * Gets an window into all the data that can be viewed as a single shard. + * * @param shard The section of data to view. + * * @return An accessor for all the data in this shard. */ protected ShardDataProvider getShardDataProvider(Shard shard) { @@ -161,6 +207,10 @@ public abstract class MicroScheduler { /** * Gets a data source for the given set of reads. + * + * @param reads the read source information + * @param byReads are we a by reads traversal, or not + * * @return A data source for the given set of reads. */ private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) { @@ -180,6 +230,9 @@ public abstract class MicroScheduler { /** * Open the reference-ordered data sources. + * + * @param rods the reference order data to execute using + * * @return A list of reference-ordered data sources. */ private List getReferenceOrderedDataSources(List> rods) { @@ -191,7 +244,9 @@ public abstract class MicroScheduler { /** * Opens a reference sequence file paired with an index. + * * @param refFile Handle to a reference sequence file. Non-null. + * * @return A thread-safe file wrapper. */ private IndexedFastaSequenceFile openReferenceSequenceFile(File refFile) { diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 78b7bbbb5..0f1bd9ce1 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.traversals; import net.sf.picard.filter.FilteringIterator; @@ -29,22 +54,12 @@ import java.util.List; * User: aaron * Date: Apr 24, 2009 * Time: 10:35:22 AM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * */ /** * @author Mark DePristo * @version 0.1 - * @date Apr 29, 2009 *

* Class TraverseDuplicates *

@@ -67,17 +82,16 @@ public class TraverseDuplicates extends TraversalEngine { super(reads, ref, rods); } - private List readsAtLoc(final SAMRecord read, PushbackIterator iter) - { + private List readsAtLoc(final SAMRecord read, PushbackIterator iter) { GenomeLoc site = GenomeLocParser.createGenomeLoc(read); ArrayList l = new ArrayList(); l.add(read); - for (SAMRecord read2: iter) { + for (SAMRecord read2 : iter) { GenomeLoc site2 = GenomeLocParser.createGenomeLoc(read2); // the next read starts too late - if ( site2.getStart() != site.getStart() ) { + if (site2.getStart() != site.getStart()) { //System.out.printf("site = %s, site2 = %s%n", site, site2); iter.pushback(read2); break; @@ -96,8 +110,8 @@ public class TraverseDuplicates extends TraversalEngine { // find the first duplicate SAMRecord key = null; - for ( SAMRecord read : reads ) { - if ( read.getDuplicateReadFlag() ) { + for (SAMRecord read : reads) { + if (read.getDuplicateReadFlag()) { // this is our key key = read; if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName())); @@ -107,20 +121,21 @@ public class TraverseDuplicates extends TraversalEngine { // At this point, there are two possibilities, we have found at least one dup or not // if it's a dup, add it to the dups list, otherwise add it to the uniques list - if ( key != null ) { + if (key != null) { final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key); final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); - for ( SAMRecord read : reads ) { + for (SAMRecord read : reads) { final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read); final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); - if (DEBUG) logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc)); + if (DEBUG) + logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc)); // read and key start at the same place, and either the this read and the key // share a mate location or the read is flagged as a duplicate - if ( readLoc.compareTo(keyLoc) == 0 && - ( readMateLoc.compareTo(keyMateLoc) == 0) || - read.getDuplicateReadFlag() ) { + if (readLoc.compareTo(keyLoc) == 0 && + (readMateLoc.compareTo(keyMateLoc) == 0) || + read.getDuplicateReadFlag()) { // we are at the same position as the dup and have the same mat pos, it's a dup if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read)); dups.add(read); @@ -137,33 +152,43 @@ public class TraverseDuplicates extends TraversalEngine { /** * Traverse by reads, given the data and the walker + * * @param sum of type T, the return from the walker * @param the generic type * @param the return type of the reduce function - * @return + * @param dupWalker our duplicates walker + * @param readIter our iterator + * + * @return the reduce type, T, the final product of all the reduce calls */ - public T actuallyTraverse(DuplicateWalker dupWalker, - Iterator readIter, - T sum) { - // while we still have more reads - // ok, here's the idea. We get all the reads that start at the same position in the genome - // We then split the list of reads into sublists of reads: - // -> those with the same mate pair position, for paired reads - // -> those flagged as unpaired and duplicated but having the same start and end and + private T actuallyTraverse(DuplicateWalker dupWalker, + Iterator readIter, + T sum) { + /** + * while we still have more reads: + * ok, here's the idea. We get all the reads that start at the same position in the genome + * We then split the list of reads into sublists of reads: + * -> those with the same mate pair position, for paired reads + * -> those flagged as unpaired and duplicated but having the same start and end and + */ PushbackIterator iter = new PushbackIterator(readIter); - for (SAMRecord read: iter) { + + + for (SAMRecord read : iter) { + // get the genome loc from the read GenomeLoc site = GenomeLocParser.createGenomeLoc(read); + List reads = readsAtLoc(read, iter); Pair, List> split = splitDuplicates(reads); - List uniqueReads = split.getFirst(); - List duplicateReads = split.getSecond(); + List uniqueReads = split.getFirst(); + List duplicateReads = split.getSecond(); logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads", site, reads.size(), uniqueReads.size(), duplicateReads.size())); - if ( reads.size() != uniqueReads.size() + duplicateReads.size() ) - throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match", - reads.size(), uniqueReads.size(), duplicateReads.size())); + if (reads.size() != uniqueReads.size() + duplicateReads.size()) + throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match", + reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size())); // Jump forward in the reference to this locus location LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0)); @@ -176,15 +201,15 @@ public class TraverseDuplicates extends TraversalEngine { byte[] refBases = new byte[0]; - if ( dupWalker.mapUniqueReadsTooP() ) { + if (dupWalker.mapUniqueReadsTooP()) { // Send each unique read to the map function - for ( SAMRecord unique : uniqueReads ) { + for (SAMRecord unique : uniqueReads) { List l = Arrays.asList(unique); sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum); } } - if ( duplicateReads.size() > 0 ) + if (duplicateReads.size() > 0) sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum); printProgress("dups", site); @@ -206,21 +231,16 @@ public class TraverseDuplicates extends TraversalEngine { SAMRecord lastRead = null; public boolean filterOut(SAMRecord rec) { boolean result = false; - String why = ""; if (rec.getReadUnmappedFlag()) { TraversalStatistics.nUnmappedReads++; result = true; - why = "Unmapped"; } else if (rec.getNotPrimaryAlignmentFlag()) { TraversalStatistics.nNotPrimary++; result = true; - why = "Not Primary"; } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { TraversalStatistics.nBadAlignments++; result = true; - why = "No alignment start"; - } - else { + } else { result = false; } @@ -250,7 +270,6 @@ public class TraverseDuplicates extends TraversalEngine { } - // -------------------------------------------------------------------------------------------------------------- // // new style interface to the system @@ -258,19 +277,21 @@ public class TraverseDuplicates extends TraversalEngine { // -------------------------------------------------------------------------------------------------------------- /** * Traverse by reads, given the data and the walker + * * @param walker the walker to execute over - * @param shard the shard of data to feed the walker - * @param sum of type T, the return from the walker - * @param the generic type - * @param the return type of the reduce function - * @return + * @param shard the shard of data to feed the walker + * @param sum of type T, the return from the walker + * @param the generic type + * @param the return type of the reduce function + * + * @return the result type T, the product of all the reduce calls */ public T traverse(Walker walker, Shard shard, ShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard)shard).getSize())); + logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard) shard).getSize())); if (!(walker instanceof DuplicateWalker)) throw new IllegalArgumentException("Walker isn't a duplicate walker!"); @@ -287,4 +308,16 @@ public class TraverseDuplicates extends TraversalEngine { PushbackIterator iter = new PushbackIterator(filterIter); return actuallyTraverse(dupWalker, iter, sum); } + + + /** + * Temporary override of printOnTraversalDone. + * TODO: Add some sort of TE.getName() function once all TraversalEngines are ported. + * + * @param sum Result of the computation. + * @param Type of the result. + */ + public void printOnTraversalDone(T sum) { + printOnTraversalDone("reads", sum); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java new file mode 100644 index 000000000..91a68992f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CountDuplicatesWalker.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.duplicates; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.List; + +/** + * a class to store the duplicates information we pass around + */ +class DuplicateCount { + public int count = 0; + public int undupDepth = 0; + public int depth = 0; + } + +/** + * + * @author aaron + * + * Class CountDuplicatesWalker + * + * Count the number of duplicates, and the average depth of duplicates at all positions. + */ +public class CountDuplicatesWalker extends DuplicateWalker { + + /** + * the map function, conforming to the duplicates interface + * @param loc the genomic location + * @param refBases the reference bases that cover this position, which we turn off + * @param context the LocusContext, containing all the reads overlapping this region + * @param uniqueReads all the unique reads, bundled together + * @param duplicateReads all the duplicate reads + * @return a DuplicateCount object, with the appropriate stats + */ + public DuplicateCount map(GenomeLoc loc, byte[] refBases, LocusContext context, List uniqueReads, List duplicateReads) { + DuplicateCount dup = new DuplicateCount(); + dup.count = 1; + dup.undupDepth = uniqueReads.size(); + dup.depth = duplicateReads.size(); + return dup; + } + + /** + * setup our walker. In this case, new a DuplicateCount object and return it + * @return the object holding the counts of the duplicates + */ + public DuplicateCount reduceInit() { + return new DuplicateCount(); + } + + /** + * the reduce step. This function combines the DuplicateCount objects, and updates the running depth average + * @param value the new DuplicateCount + * @param sum the running sum DuplicateCount + * @return a new DuplicateCount with the updated sums + */ + public DuplicateCount reduce(DuplicateCount value, DuplicateCount sum) { + DuplicateCount dup = new DuplicateCount(); + dup.count = sum.count + value.count; + dup.depth = value.depth + sum.depth; + dup.undupDepth = value.undupDepth + sum.undupDepth; + return dup; + } + + /** + * when we're done, print out the collected stats + * @param result the result of the traversal engine, to be printed out + */ + public void onTraversalDone(DuplicateCount result) { + out.println("[REDUCE RESULT] Traversal result is: "); + out.println("duplicate count = " + result.count); + out.println("average depth = " + (double)result.depth / (double)result.count); + out.println("duplicates seen = " + result.depth); + out.println("average unique depth = " + (double)result.undupDepth / (double)result.undupDepth); + out.println("unique read count = " + result.undupDepth); + + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicateQualsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicateQualsWalker.java index 32111633f..dd72d83b1 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicateQualsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/DuplicateQualsWalker.java @@ -1,20 +1,44 @@ -package org.broadinstitute.sting.playground.gatk.duplicates.walkers; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ -import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.duplicates.DuplicateComp; -import org.broadinstitute.sting.utils.duplicates.DupUtils; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.util.List; -import java.util.ArrayList; -import java.io.PrintStream; +package org.broadinstitute.sting.playground.gatk.walkers.duplicates; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.duplicates.DupUtils; +import org.broadinstitute.sting.utils.duplicates.DuplicateComp; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.List; class MismatchCounter { long nObs = 0;