Changed the duplicate traversal over to the new style of traversal and plumbed into the genome analysis engine. Also added a CountDuplicates walker, to validate the engine.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1072 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-06-22 21:11:18 +00:00
parent 4a92a999a0
commit 8b4d0412ca
5 changed files with 354 additions and 112 deletions

View File

@ -1,3 +1,28 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk; package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile;
@ -91,7 +116,7 @@ public class GenomeAnalysisEngine {
// if we're a read or a locus walker, we use the new system. Right now we have complicated // if we're a read or a locus walker, we use the new system. Right now we have complicated
// branching based on the input data, but this should disapear when all the traversals are switched over // branching based on the input data, but this should disapear when all the traversals are switched over
if (my_walker instanceof LocusWalker || my_walker instanceof ReadWalker) { if (my_walker instanceof LocusWalker || my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
microScheduler = createMicroscheduler(my_walker, rods); microScheduler = createMicroscheduler(my_walker, rods);
} else { // we have an old style traversal, once we're done return } else { // we have an old style traversal, once we're done return
legacyTraversal(my_walker, rods); legacyTraversal(my_walker, rods);
@ -129,15 +154,12 @@ public class GenomeAnalysisEngine {
* into their own function allows us to deviate in the two behaviors so the new style traversals aren't limited to what * into their own function allows us to deviate in the two behaviors so the new style traversals aren't limited to what
* the old style does. As traversals are converted, this function should disappear. * the old style does. As traversals are converted, this function should disappear.
* *
* @param my_walker * @param my_walker the walker to run
* @param rods * @param rods the rod tracks to associate with
*/ */
private void legacyTraversal(Walker my_walker, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) { private void legacyTraversal(Walker my_walker, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
if (my_walker instanceof LocusWindowWalker) { if (my_walker instanceof LocusWindowWalker) {
this.engine = new TraverseByLocusWindows(argCollection.samFiles, argCollection.referenceFile, rods); this.engine = new TraverseByLocusWindows(argCollection.samFiles, argCollection.referenceFile, rods);
} else if (my_walker instanceof DuplicateWalker) {
// we're a duplicate walker
this.engine = new TraverseDuplicates(argCollection.samFiles, argCollection.referenceFile, rods);
} else { } else {
throw new RuntimeException("Unexpected walker type: " + my_walker); throw new RuntimeException("Unexpected walker type: " + my_walker);
} }
@ -178,11 +200,13 @@ public class GenomeAnalysisEngine {
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads); microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
engine = microScheduler.getTraversalEngine(); engine = microScheduler.getTraversalEngine();
} }
else if (my_walker instanceof ReadWalker) { else if (my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
if (argCollection.referenceFile == null) if (argCollection.referenceFile == null)
Utils.scareUser(String.format("Locus-based traversals require a reference file but none was given")); Utils.scareUser(String.format("Read-based traversals require a reference file but none was given"));
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads); microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
engine = microScheduler.getTraversalEngine(); engine = microScheduler.getTraversalEngine();
} else {
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
} }
return microScheduler; return microScheduler;

View File

@ -1,32 +1,56 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.executive; package org.broadinstitute.sting.gatk.executive;
import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraverseReads;
import org.broadinstitute.sting.gatk.traversals.TraverseLoci;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.gatk.traversals.TraverseDuplicates;
import org.broadinstitute.sting.gatk.traversals.TraverseLoci;
import org.broadinstitute.sting.gatk.traversals.TraverseReads;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.util.List;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
@ -51,8 +75,13 @@ public abstract class MicroScheduler {
/** /**
* MicroScheduler factory function. Create a microscheduler appropriate for reducing the * MicroScheduler factory function. Create a microscheduler appropriate for reducing the
* selected walker. * selected walker.
*
* @param walker Which walker to use. * @param walker Which walker to use.
* @param reads the informations associated with the reads
* @param ref the reference file
* @param rods the rods to include in the traversal
* @param nThreadsToUse Number of threads to utilize. * @param nThreadsToUse Number of threads to utilize.
*
* @return The best-fit microscheduler. * @return The best-fit microscheduler.
*/ */
public static MicroScheduler create(Walker walker, Reads reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse) { public static MicroScheduler create(Walker walker, Reads reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse) {
@ -67,16 +96,24 @@ public abstract class MicroScheduler {
/** /**
* Create a microscheduler given the reads and reference. * Create a microscheduler given the reads and reference.
*
* @param walker the walker to execute with
* @param reads The reads. * @param reads The reads.
* @param refFile File pointer to the reference. * @param refFile File pointer to the reference.
* @param rods the rods to include in the traversal
*/ */
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) { protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
if (walker instanceof ReadWalker) { if (walker instanceof ReadWalker) {
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods); traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
this.reads = getReadsDataSource(reads, true); this.reads = getReadsDataSource(reads, true);
} else { } else if (walker instanceof LocusWalker) {
traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods); traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
this.reads = getReadsDataSource(reads, false); this.reads = getReadsDataSource(reads, false);
} else if (walker instanceof DuplicateWalker) {
traversalEngine = new TraverseDuplicates(reads.getReadsFiles(), refFile, rods);
this.reads = getReadsDataSource(reads, true);
} else {
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
} }
@ -87,6 +124,7 @@ public abstract class MicroScheduler {
/** /**
* A temporary getter for the traversal engine. In the future, clients * A temporary getter for the traversal engine. In the future, clients
* of the microscheduler shouldn't need to know anything about the traversal engine. * of the microscheduler shouldn't need to know anything about the traversal engine.
*
* @return The traversal engine. * @return The traversal engine.
*/ */
public TraversalEngine getTraversalEngine() { public TraversalEngine getTraversalEngine() {
@ -95,18 +133,23 @@ public abstract class MicroScheduler {
/** /**
* Walks a walker over the given list of intervals. * Walks a walker over the given list of intervals.
*
* @param walker Computation to perform over dataset. * @param walker Computation to perform over dataset.
* @param intervals A list of intervals over which to walk. Null for whole dataset. * @param intervals A list of intervals over which to walk. Null for whole dataset.
* @param maxIterations the maximum number of iterations we're to perform * @param maxIterations the maximum number of iterations we're to perform
*
* @return the return type of the walker * @return the return type of the walker
*/ */
public abstract Object execute(Walker walker, GenomeLocSortedSet intervals, Integer maxIterations); public abstract Object execute(Walker walker, GenomeLocSortedSet intervals, Integer maxIterations);
/** /**
* Get the sharding strategy given a driving data source. * Get the sharding strategy given a driving data source.
*
* @param walker Walker for which to infer sharding strategy. * @param walker Walker for which to infer sharding strategy.
* @param drivingDataSource Data on which to shard. * @param drivingDataSource Data on which to shard.
* @param intervals Intervals to use when limiting sharding. * @param intervals Intervals to use when limiting sharding.
* @param maxIterations the maximum number of iterations to run through
*
* @return Sharding strategy for this driving data source. * @return Sharding strategy for this driving data source.
*/ */
protected ShardStrategy getShardStrategy(Walker walker, protected ShardStrategy getShardStrategy(Walker walker,
@ -130,7 +173,8 @@ public abstract class MicroScheduler {
drivingDataSource.getSequenceDictionary(), drivingDataSource.getSequenceDictionary(),
SHARD_SIZE, maxIterations); SHARD_SIZE, maxIterations);
} else if (walker instanceof ReadWalker) { } else if (walker instanceof ReadWalker ||
walker instanceof DuplicateWalker) {
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS; shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS;
@ -152,7 +196,9 @@ public abstract class MicroScheduler {
/** /**
* Gets an window into all the data that can be viewed as a single shard. * Gets an window into all the data that can be viewed as a single shard.
*
* @param shard The section of data to view. * @param shard The section of data to view.
*
* @return An accessor for all the data in this shard. * @return An accessor for all the data in this shard.
*/ */
protected ShardDataProvider getShardDataProvider(Shard shard) { protected ShardDataProvider getShardDataProvider(Shard shard) {
@ -161,6 +207,10 @@ public abstract class MicroScheduler {
/** /**
* Gets a data source for the given set of reads. * Gets a data source for the given set of reads.
*
* @param reads the read source information
* @param byReads are we a by reads traversal, or not
*
* @return A data source for the given set of reads. * @return A data source for the given set of reads.
*/ */
private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) { private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) {
@ -180,6 +230,9 @@ public abstract class MicroScheduler {
/** /**
* Open the reference-ordered data sources. * Open the reference-ordered data sources.
*
* @param rods the reference order data to execute using
*
* @return A list of reference-ordered data sources. * @return A list of reference-ordered data sources.
*/ */
private List<ReferenceOrderedDataSource> getReferenceOrderedDataSources(List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) { private List<ReferenceOrderedDataSource> getReferenceOrderedDataSources(List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
@ -191,7 +244,9 @@ public abstract class MicroScheduler {
/** /**
* Opens a reference sequence file paired with an index. * Opens a reference sequence file paired with an index.
*
* @param refFile Handle to a reference sequence file. Non-null. * @param refFile Handle to a reference sequence file. Non-null.
*
* @return A thread-safe file wrapper. * @return A thread-safe file wrapper.
*/ */
private IndexedFastaSequenceFile openReferenceSequenceFile(File refFile) { private IndexedFastaSequenceFile openReferenceSequenceFile(File refFile) {

View File

@ -1,3 +1,28 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals; package org.broadinstitute.sting.gatk.traversals;
import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.FilteringIterator;
@ -29,22 +54,12 @@ import java.util.List;
* User: aaron * User: aaron
* Date: Apr 24, 2009 * Date: Apr 24, 2009
* Time: 10:35:22 AM * Time: 10:35:22 AM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/ */
/** /**
* @author Mark DePristo * @author Mark DePristo
* @version 0.1 * @version 0.1
* @date Apr 29, 2009
* <p/> * <p/>
* Class TraverseDuplicates * Class TraverseDuplicates
* <p/> * <p/>
@ -67,8 +82,7 @@ public class TraverseDuplicates extends TraversalEngine {
super(reads, ref, rods); super(reads, ref, rods);
} }
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter) private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter) {
{
GenomeLoc site = GenomeLocParser.createGenomeLoc(read); GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
ArrayList<SAMRecord> l = new ArrayList<SAMRecord>(); ArrayList<SAMRecord> l = new ArrayList<SAMRecord>();
@ -114,7 +128,8 @@ public class TraverseDuplicates extends TraversalEngine {
for (SAMRecord read : reads) { for (SAMRecord read : reads) {
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read); final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
if (DEBUG) logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc)); if (DEBUG)
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
// read and key start at the same place, and either the this read and the key // read and key start at the same place, and either the this read and the key
// share a mate location or the read is flagged as a duplicate // share a mate location or the read is flagged as a duplicate
@ -137,23 +152,33 @@ public class TraverseDuplicates extends TraversalEngine {
/** /**
* Traverse by reads, given the data and the walker * Traverse by reads, given the data and the walker
*
* @param sum of type T, the return from the walker * @param sum of type T, the return from the walker
* @param <M> the generic type * @param <M> the generic type
* @param <T> the return type of the reduce function * @param <T> the return type of the reduce function
* @return * @param dupWalker our duplicates walker
* @param readIter our iterator
*
* @return the reduce type, T, the final product of all the reduce calls
*/ */
public <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker, private <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
Iterator<SAMRecord> readIter, Iterator<SAMRecord> readIter,
T sum) { T sum) {
// while we still have more reads /**
// ok, here's the idea. We get all the reads that start at the same position in the genome * while we still have more reads:
// We then split the list of reads into sublists of reads: * ok, here's the idea. We get all the reads that start at the same position in the genome
// -> those with the same mate pair position, for paired reads * We then split the list of reads into sublists of reads:
// -> those flagged as unpaired and duplicated but having the same start and end and * -> those with the same mate pair position, for paired reads
* -> those flagged as unpaired and duplicated but having the same start and end and
*/
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter); PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
for (SAMRecord read : iter) { for (SAMRecord read : iter) {
// get the genome loc from the read // get the genome loc from the read
GenomeLoc site = GenomeLocParser.createGenomeLoc(read); GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
List<SAMRecord> reads = readsAtLoc(read, iter); List<SAMRecord> reads = readsAtLoc(read, iter);
Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads); Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
List<SAMRecord> uniqueReads = split.getFirst(); List<SAMRecord> uniqueReads = split.getFirst();
@ -163,7 +188,7 @@ public class TraverseDuplicates extends TraversalEngine {
site, reads.size(), uniqueReads.size(), duplicateReads.size())); site, reads.size(), uniqueReads.size(), duplicateReads.size()));
if (reads.size() != uniqueReads.size() + duplicateReads.size()) if (reads.size() != uniqueReads.size() + duplicateReads.size())
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match", throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
reads.size(), uniqueReads.size(), duplicateReads.size())); reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
// Jump forward in the reference to this locus location // Jump forward in the reference to this locus location
LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0)); LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0));
@ -206,21 +231,16 @@ public class TraverseDuplicates extends TraversalEngine {
SAMRecord lastRead = null; SAMRecord lastRead = null;
public boolean filterOut(SAMRecord rec) { public boolean filterOut(SAMRecord rec) {
boolean result = false; boolean result = false;
String why = "";
if (rec.getReadUnmappedFlag()) { if (rec.getReadUnmappedFlag()) {
TraversalStatistics.nUnmappedReads++; TraversalStatistics.nUnmappedReads++;
result = true; result = true;
why = "Unmapped";
} else if (rec.getNotPrimaryAlignmentFlag()) { } else if (rec.getNotPrimaryAlignmentFlag()) {
TraversalStatistics.nNotPrimary++; TraversalStatistics.nNotPrimary++;
result = true; result = true;
why = "Not Primary";
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
TraversalStatistics.nBadAlignments++; TraversalStatistics.nBadAlignments++;
result = true; result = true;
why = "No alignment start"; } else {
}
else {
result = false; result = false;
} }
@ -250,7 +270,6 @@ public class TraverseDuplicates extends TraversalEngine {
} }
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
// new style interface to the system // new style interface to the system
@ -258,12 +277,14 @@ public class TraverseDuplicates extends TraversalEngine {
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
/** /**
* Traverse by reads, given the data and the walker * Traverse by reads, given the data and the walker
*
* @param walker the walker to execute over * @param walker the walker to execute over
* @param shard the shard of data to feed the walker * @param shard the shard of data to feed the walker
* @param sum of type T, the return from the walker * @param sum of type T, the return from the walker
* @param <M> the generic type * @param <M> the generic type
* @param <T> the return type of the reduce function * @param <T> the return type of the reduce function
* @return *
* @return the result type T, the product of all the reduce calls
*/ */
public <M, T> T traverse(Walker<M, T> walker, public <M, T> T traverse(Walker<M, T> walker,
Shard shard, Shard shard,
@ -287,4 +308,16 @@ public class TraverseDuplicates extends TraversalEngine {
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter); PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
return actuallyTraverse(dupWalker, iter, sum); return actuallyTraverse(dupWalker, iter, sum);
} }
/**
* Temporary override of printOnTraversalDone.
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
*
* @param sum Result of the computation.
* @param <T> Type of the result.
*/
public <T> void printOnTraversalDone(T sum) {
printOnTraversalDone("reads", sum);
}
} }

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List;
/**
 * Mutable accumulator holding the duplicate statistics passed between
 * map and reduce calls of the CountDuplicatesWalker.
 */
class DuplicateCount {
    // number of duplicate-bearing sites seen (each map() call contributes 1)
    public int count;
    // total number of unique (non-duplicate) reads observed across all sites
    public int undupDepth;
    // total number of duplicate reads observed across all sites
    public int depth;
}
/**
 * Count the number of duplicate-bearing sites, the total and average duplicate
 * depth, and the total and average unique-read depth across the traversal.
 *
 * @author aaron
 */
public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, DuplicateCount> {

    /**
     * The map function, conforming to the duplicates interface: record the
     * statistics for a single site that carries duplicate reads.
     *
     * @param loc            the genomic location
     * @param refBases       the reference bases that cover this position
     * @param context        the LocusContext, containing all the reads overlapping this region
     * @param uniqueReads    all the unique reads at this site, bundled together
     * @param duplicateReads all the duplicate reads at this site
     * @return a DuplicateCount object with this site's stats (count is always 1)
     */
    public DuplicateCount map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> uniqueReads, List<SAMRecord> duplicateReads) {
        DuplicateCount dup = new DuplicateCount();
        dup.count = 1;
        dup.undupDepth = uniqueReads.size();
        dup.depth = duplicateReads.size();
        return dup;
    }

    /**
     * Set up our walker: start the reduction from an all-zero DuplicateCount.
     *
     * @return the object holding the counts of the duplicates
     */
    public DuplicateCount reduceInit() {
        return new DuplicateCount();
    }

    /**
     * The reduce step: combine two DuplicateCount objects field by field.
     *
     * @param value the new DuplicateCount from the latest map call
     * @param sum   the running-sum DuplicateCount
     * @return a new DuplicateCount with the updated sums
     */
    public DuplicateCount reduce(DuplicateCount value, DuplicateCount sum) {
        DuplicateCount dup = new DuplicateCount();
        dup.count = sum.count + value.count;
        dup.depth = value.depth + sum.depth;
        dup.undupDepth = value.undupDepth + sum.undupDepth;
        return dup;
    }

    /**
     * When we're done, print out the collected stats.
     *
     * @param result the result of the traversal engine, to be printed out
     */
    public void onTraversalDone(DuplicateCount result) {
        out.println("[REDUCE RESULT] Traversal result is: ");
        out.println("duplicate count = " + result.count);
        out.println("average depth = " + (double) result.depth / (double) result.count);
        out.println("duplicates seen = " + result.depth);
        // BUG FIX: the original divided undupDepth by itself, which always
        // prints 1.0; average unique depth is unique reads per site, i.e.
        // undupDepth / count, parallel to "average depth" above.
        out.println("average unique depth = " + (double) result.undupDepth / (double) result.count);
        out.println("unique read count = " + result.undupDepth);
    }
}

View File

@ -1,20 +1,44 @@
package org.broadinstitute.sting.playground.gatk.duplicates.walkers; /*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.broadinstitute.sting.gatk.LocusContext; package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.List;
import java.util.ArrayList;
import java.io.PrintStream;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
class MismatchCounter { class MismatchCounter {
long nObs = 0; long nObs = 0;