Changed the duplicate traversal over to the new style of traversal and plumbed into the genome analysis engine. Also added a CountDuplicates walker, to validate the engine.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1072 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-06-22 21:11:18 +00:00
parent 4a92a999a0
commit 8b4d0412ca
5 changed files with 354 additions and 112 deletions

View File

@ -1,3 +1,28 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.ReferenceSequenceFile;
@ -91,7 +116,7 @@ public class GenomeAnalysisEngine {
// if we're a read or a locus walker, we use the new system. Right now we have complicated
// branching based on the input data, but this should disappear when all the traversals are switched over
if (my_walker instanceof LocusWalker || my_walker instanceof ReadWalker) {
if (my_walker instanceof LocusWalker || my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
microScheduler = createMicroscheduler(my_walker, rods);
} else { // we have an old style traversal, once we're done return
legacyTraversal(my_walker, rods);
@ -129,15 +154,12 @@ public class GenomeAnalysisEngine {
* into their own function allows us to deviate in the two behaviors so the new style traversals aren't limited to what
* the old style does. As traversals are converted, this function should disappear.
*
* @param my_walker
* @param rods
* @param my_walker the walker to run
* @param rods the rod tracks to associate with
*/
private void legacyTraversal(Walker my_walker, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
if (my_walker instanceof LocusWindowWalker) {
this.engine = new TraverseByLocusWindows(argCollection.samFiles, argCollection.referenceFile, rods);
} else if (my_walker instanceof DuplicateWalker) {
// we're a duplicate walker
this.engine = new TraverseDuplicates(argCollection.samFiles, argCollection.referenceFile, rods);
} else {
throw new RuntimeException("Unexpected walker type: " + my_walker);
}
@ -178,11 +200,13 @@ public class GenomeAnalysisEngine {
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
engine = microScheduler.getTraversalEngine();
}
else if (my_walker instanceof ReadWalker) {
else if (my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
if (argCollection.referenceFile == null)
Utils.scareUser(String.format("Locus-based traversals require a reference file but none was given"));
Utils.scareUser(String.format("Read-based traversals require a reference file but none was given"));
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
engine = microScheduler.getTraversalEngine();
} else {
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
}
return microScheduler;

View File

@ -1,32 +1,56 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.executive;
import net.sf.picard.reference.ReferenceSequenceFile;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraverseReads;
import org.broadinstitute.sting.gatk.traversals.TraverseLoci;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraverseDuplicates;
import org.broadinstitute.sting.gatk.traversals.TraverseLoci;
import org.broadinstitute.sting.gatk.traversals.TraverseReads;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.List;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
@ -51,8 +75,13 @@ public abstract class MicroScheduler {
/**
* MicroScheduler factory function. Create a microscheduler appropriate for reducing the
* selected walker.
* @param walker Which walker to use.
*
* @param walker Which walker to use.
* @param reads the informations associated with the reads
* @param ref the reference file
* @param rods the rods to include in the traversal
* @param nThreadsToUse Number of threads to utilize.
*
* @return The best-fit microscheduler.
*/
public static MicroScheduler create(Walker walker, Reads reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse) {
@ -67,16 +96,24 @@ public abstract class MicroScheduler {
/**
* Create a microscheduler given the reads and reference.
* @param reads The reads.
*
* @param walker the walker to execute with
* @param reads The reads.
* @param refFile File pointer to the reference.
* @param rods the rods to include in the traversal
*/
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
if (walker instanceof ReadWalker) {
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
this.reads = getReadsDataSource(reads,true);
} else {
this.reads = getReadsDataSource(reads, true);
} else if (walker instanceof LocusWalker) {
traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
this.reads = getReadsDataSource(reads,false);
this.reads = getReadsDataSource(reads, false);
} else if (walker instanceof DuplicateWalker) {
traversalEngine = new TraverseDuplicates(reads.getReadsFiles(), refFile, rods);
this.reads = getReadsDataSource(reads, true);
} else {
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
}
@ -87,6 +124,7 @@ public abstract class MicroScheduler {
/**
* A temporary getter for the traversal engine. In the future, clients
* of the microscheduler shouldn't need to know anything about the traversal engine.
*
* @return The traversal engine.
*/
public TraversalEngine getTraversalEngine() {
@ -95,18 +133,23 @@ public abstract class MicroScheduler {
/**
* Walks a walker over the given list of intervals.
* @param walker Computation to perform over dataset.
* @param intervals A list of intervals over which to walk. Null for whole dataset.
*
* @param walker Computation to perform over dataset.
* @param intervals A list of intervals over which to walk. Null for whole dataset.
* @param maxIterations the maximum number of iterations we're to perform
*
* @return the return type of the walker
*/
public abstract Object execute(Walker walker, GenomeLocSortedSet intervals, Integer maxIterations);
/**
* Get the sharding strategy given a driving data source.
* @param walker Walker for which to infer sharding strategy.
*
* @param walker Walker for which to infer sharding strategy.
* @param drivingDataSource Data on which to shard.
* @param intervals Intervals to use when limiting sharding.
* @param intervals Intervals to use when limiting sharding.
* @param maxIterations the maximum number of iterations to run through
*
* @return Sharding strategy for this driving data source.
*/
protected ShardStrategy getShardStrategy(Walker walker,
@ -118,31 +161,32 @@ public abstract class MicroScheduler {
if (walker instanceof LocusWalker) {
if (intervals != null) {
shardType = (walker.isReduceByInterval()) ?
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
ShardStrategyFactory.SHATTER_STRATEGY.LINEAR;
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
ShardStrategyFactory.SHATTER_STRATEGY.LINEAR;
shardStrategy = ShardStrategyFactory.shatter(shardType,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
intervals, maxIterations);
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
intervals, maxIterations);
} else
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE, maxIterations);
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE, maxIterations);
} else if (walker instanceof ReadWalker) {
} else if (walker instanceof ReadWalker ||
walker instanceof DuplicateWalker) {
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS;
if (intervals != null) {
shardStrategy = ShardStrategyFactory.shatter(shardType,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
intervals, maxIterations);
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
intervals, maxIterations);
} else {
shardStrategy = ShardStrategyFactory.shatter(shardType,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE, maxIterations);
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE, maxIterations);
}
} else
throw new StingException("Unable to support walker of type" + walker.getClass().getName());
@ -152,7 +196,9 @@ public abstract class MicroScheduler {
/**
* Gets an window into all the data that can be viewed as a single shard.
*
* @param shard The section of data to view.
*
* @return An accessor for all the data in this shard.
*/
protected ShardDataProvider getShardDataProvider(Shard shard) {
@ -161,6 +207,10 @@ public abstract class MicroScheduler {
/**
* Gets a data source for the given set of reads.
*
* @param reads the read source information
* @param byReads are we a by reads traversal, or not
*
* @return A data source for the given set of reads.
*/
private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) {
@ -180,6 +230,9 @@ public abstract class MicroScheduler {
/**
* Open the reference-ordered data sources.
*
* @param rods the reference order data to execute using
*
* @return A list of reference-ordered data sources.
*/
private List<ReferenceOrderedDataSource> getReferenceOrderedDataSources(List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
@ -191,7 +244,9 @@ public abstract class MicroScheduler {
/**
* Opens a reference sequence file paired with an index.
*
* @param refFile Handle to a reference sequence file. Non-null.
*
* @return A thread-safe file wrapper.
*/
private IndexedFastaSequenceFile openReferenceSequenceFile(File refFile) {

View File

@ -1,3 +1,28 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import net.sf.picard.filter.FilteringIterator;
@ -29,22 +54,12 @@ import java.util.List;
* User: aaron
* Date: Apr 24, 2009
* Time: 10:35:22 AM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
* @author Mark DePristo
* @version 0.1
* @date Apr 29, 2009
* <p/>
* Class TraverseDuplicates
* <p/>
@ -67,17 +82,16 @@ public class TraverseDuplicates extends TraversalEngine {
super(reads, ref, rods);
}
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter)
{
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter) {
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
ArrayList<SAMRecord> l = new ArrayList<SAMRecord>();
l.add(read);
for (SAMRecord read2: iter) {
for (SAMRecord read2 : iter) {
GenomeLoc site2 = GenomeLocParser.createGenomeLoc(read2);
// the next read starts too late
if ( site2.getStart() != site.getStart() ) {
if (site2.getStart() != site.getStart()) {
//System.out.printf("site = %s, site2 = %s%n", site, site2);
iter.pushback(read2);
break;
@ -96,8 +110,8 @@ public class TraverseDuplicates extends TraversalEngine {
// find the first duplicate
SAMRecord key = null;
for ( SAMRecord read : reads ) {
if ( read.getDuplicateReadFlag() ) {
for (SAMRecord read : reads) {
if (read.getDuplicateReadFlag()) {
// this is our key
key = read;
if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName()));
@ -107,20 +121,21 @@ public class TraverseDuplicates extends TraversalEngine {
// At this point, there are two possibilities, we have found at least one dup or not
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
if ( key != null ) {
if (key != null) {
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
for ( SAMRecord read : reads ) {
for (SAMRecord read : reads) {
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
if (DEBUG) logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
if (DEBUG)
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
// read and key start at the same place, and either the this read and the key
// share a mate location or the read is flagged as a duplicate
if ( readLoc.compareTo(keyLoc) == 0 &&
( readMateLoc.compareTo(keyMateLoc) == 0) ||
read.getDuplicateReadFlag() ) {
if (readLoc.compareTo(keyLoc) == 0 &&
(readMateLoc.compareTo(keyMateLoc) == 0) ||
read.getDuplicateReadFlag()) {
// we are at the same position as the dup and have the same mate position, it's a dup
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
dups.add(read);
@ -137,33 +152,43 @@ public class TraverseDuplicates extends TraversalEngine {
/**
* Traverse by reads, given the data and the walker
*
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @return
* @param dupWalker our duplicates walker
* @param readIter our iterator
*
* @return the reduce type, T, the final product of all the reduce calls
*/
public <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
Iterator<SAMRecord> readIter,
T sum) {
// while we still have more reads
// ok, here's the idea. We get all the reads that start at the same position in the genome
// We then split the list of reads into sublists of reads:
// -> those with the same mate pair position, for paired reads
// -> those flagged as unpaired and duplicated but having the same start and end and
private <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
Iterator<SAMRecord> readIter,
T sum) {
/**
* while we still have more reads:
* ok, here's the idea. We get all the reads that start at the same position in the genome
* We then split the list of reads into sublists of reads:
* -> those with the same mate pair position, for paired reads
* -> those flagged as unpaired and duplicated but having the same start and end and
*/
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
for (SAMRecord read: iter) {
for (SAMRecord read : iter) {
// get the genome loc from the read
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
List<SAMRecord> reads = readsAtLoc(read, iter);
Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
List<SAMRecord> uniqueReads = split.getFirst();
List<SAMRecord> duplicateReads = split.getSecond();
List<SAMRecord> uniqueReads = split.getFirst();
List<SAMRecord> duplicateReads = split.getSecond();
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
if ( reads.size() != uniqueReads.size() + duplicateReads.size() )
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
reads.size(), uniqueReads.size(), duplicateReads.size()));
if (reads.size() != uniqueReads.size() + duplicateReads.size())
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
// Jump forward in the reference to this locus location
LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0));
@ -176,15 +201,15 @@ public class TraverseDuplicates extends TraversalEngine {
byte[] refBases = new byte[0];
if ( dupWalker.mapUniqueReadsTooP() ) {
if (dupWalker.mapUniqueReadsTooP()) {
// Send each unique read to the map function
for ( SAMRecord unique : uniqueReads ) {
for (SAMRecord unique : uniqueReads) {
List<SAMRecord> l = Arrays.asList(unique);
sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
}
}
if ( duplicateReads.size() > 0 )
if (duplicateReads.size() > 0)
sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
printProgress("dups", site);
@ -206,21 +231,16 @@ public class TraverseDuplicates extends TraversalEngine {
SAMRecord lastRead = null;
public boolean filterOut(SAMRecord rec) {
boolean result = false;
String why = "";
if (rec.getReadUnmappedFlag()) {
TraversalStatistics.nUnmappedReads++;
result = true;
why = "Unmapped";
} else if (rec.getNotPrimaryAlignmentFlag()) {
TraversalStatistics.nNotPrimary++;
result = true;
why = "Not Primary";
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
TraversalStatistics.nBadAlignments++;
result = true;
why = "No alignment start";
}
else {
} else {
result = false;
}
@ -250,7 +270,6 @@ public class TraverseDuplicates extends TraversalEngine {
}
// --------------------------------------------------------------------------------------------------------------
//
// new style interface to the system
@ -258,19 +277,21 @@ public class TraverseDuplicates extends TraversalEngine {
// --------------------------------------------------------------------------------------------------------------
/**
* Traverse by reads, given the data and the walker
*
* @param walker the walker to execute over
* @param shard the shard of data to feed the walker
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @return
* @param shard the shard of data to feed the walker
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
*
* @return the result type T, the product of all the reduce calls
*/
public <M, T> T traverse(Walker<M, T> walker,
Shard shard,
ShardDataProvider dataProvider,
T sum) {
logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard)shard).getSize()));
logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
if (!(walker instanceof DuplicateWalker))
throw new IllegalArgumentException("Walker isn't a duplicate walker!");
@ -287,4 +308,16 @@ public class TraverseDuplicates extends TraversalEngine {
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
return actuallyTraverse(dupWalker, iter, sum);
}
/**
* Temporary override of printOnTraversalDone.
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
*
* @param sum Result of the computation.
* @param <T> Type of the result.
*/
public <T> void printOnTraversalDone(T sum) {
// delegate to the two-argument overload, labeling this traversal "reads"
printOnTraversalDone("reads", sum);
}
}

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List;
/**
 * Mutable accumulator holding the duplicate-read statistics that are
 * passed between the map and reduce steps of the duplicates traversal.
 */
class DuplicateCount {
    /** number of sites (map invocations) folded into this accumulator */
    public int count;
    /** total depth contributed by unique (non-duplicate) reads */
    public int undupDepth;
    /** total depth contributed by duplicate reads */
    public int depth;
}
/**
 * @author aaron
 *
 * Class CountDuplicatesWalker
 *
 * Count the number of duplicates, and the average depth of duplicates at all positions.
 * Exists primarily to validate the duplicate traversal in the genome analysis engine.
 */
public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, DuplicateCount> {

    /**
     * the map function, conforming to the duplicates interface: record the unique
     * and duplicate read depth observed at a single genomic site.
     *
     * @param loc the genomic location
     * @param refBases the reference bases that cover this position, which we turn off
     * @param context the LocusContext, containing all the reads overlapping this region
     * @param uniqueReads all the unique reads, bundled together
     * @param duplicateReads all the duplicate reads
     * @return a DuplicateCount object, with the appropriate stats for this one site
     */
    public DuplicateCount map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> uniqueReads, List<SAMRecord> duplicateReads) {
        DuplicateCount dup = new DuplicateCount();
        dup.count = 1; // each map call corresponds to exactly one site
        dup.undupDepth = uniqueReads.size();
        dup.depth = duplicateReads.size();
        return dup;
    }

    /**
     * setup our walker. In this case, new a DuplicateCount object and return it
     *
     * @return the zeroed accumulator used as the seed for reduce
     */
    public DuplicateCount reduceInit() {
        return new DuplicateCount();
    }

    /**
     * the reduce step. This function combines the DuplicateCount objects field by field.
     *
     * @param value the new DuplicateCount
     * @param sum the running sum DuplicateCount
     * @return a new DuplicateCount with the updated sums
     */
    public DuplicateCount reduce(DuplicateCount value, DuplicateCount sum) {
        DuplicateCount dup = new DuplicateCount();
        dup.count = sum.count + value.count;
        dup.depth = value.depth + sum.depth;
        dup.undupDepth = value.undupDepth + sum.undupDepth;
        return dup;
    }

    /**
     * when we're done, print out the collected stats
     *
     * @param result the result of the traversal engine, to be printed out
     */
    public void onTraversalDone(DuplicateCount result) {
        out.println("[REDUCE RESULT] Traversal result is: ");
        out.println("duplicate count = " + result.count);
        out.println("average depth = " + (double) result.depth / (double) result.count);
        out.println("duplicates seen = " + result.depth);
        // BUG FIX: the original divided undupDepth by itself, which is always 1.0
        // for any nonzero total; the average unique depth is unique reads per site.
        out.println("average unique depth = " + (double) result.undupDepth / (double) result.count);
        out.println("unique read count = " + result.undupDepth);
    }
}

View File

@ -1,20 +1,44 @@
package org.broadinstitute.sting.playground.gatk.duplicates.walkers;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List;
import java.util.ArrayList;
import java.io.PrintStream;
package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
class MismatchCounter {
long nObs = 0;