Changed the duplicate traversal over to the new style of traversal and plumbed into the genome analysis engine. Also added a CountDuplicates walker, to validate the engine.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1072 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4a92a999a0
commit
8b4d0412ca
|
|
@ -1,3 +1,28 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
|
|
@ -91,7 +116,7 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
// if we're a read or a locus walker, we use the new system. Right now we have complicated
|
||||
// branching based on the input data, but this should disapear when all the traversals are switched over
|
||||
if (my_walker instanceof LocusWalker || my_walker instanceof ReadWalker) {
|
||||
if (my_walker instanceof LocusWalker || my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
|
||||
microScheduler = createMicroscheduler(my_walker, rods);
|
||||
} else { // we have an old style traversal, once we're done return
|
||||
legacyTraversal(my_walker, rods);
|
||||
|
|
@ -129,15 +154,12 @@ public class GenomeAnalysisEngine {
|
|||
* into their own function allows us to deviate in the two behaviors so the new style traversals aren't limited to what
|
||||
* the old style does. As traversals are converted, this function should disappear.
|
||||
*
|
||||
* @param my_walker
|
||||
* @param rods
|
||||
* @param my_walker the walker to run
|
||||
* @param rods the rod tracks to associate with
|
||||
*/
|
||||
private void legacyTraversal(Walker my_walker, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
if (my_walker instanceof LocusWindowWalker) {
|
||||
this.engine = new TraverseByLocusWindows(argCollection.samFiles, argCollection.referenceFile, rods);
|
||||
} else if (my_walker instanceof DuplicateWalker) {
|
||||
// we're a duplicate walker
|
||||
this.engine = new TraverseDuplicates(argCollection.samFiles, argCollection.referenceFile, rods);
|
||||
} else {
|
||||
throw new RuntimeException("Unexpected walker type: " + my_walker);
|
||||
}
|
||||
|
|
@ -178,11 +200,13 @@ public class GenomeAnalysisEngine {
|
|||
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
|
||||
engine = microScheduler.getTraversalEngine();
|
||||
}
|
||||
else if (my_walker instanceof ReadWalker) {
|
||||
else if (my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
|
||||
if (argCollection.referenceFile == null)
|
||||
Utils.scareUser(String.format("Locus-based traversals require a reference file but none was given"));
|
||||
Utils.scareUser(String.format("Read-based traversals require a reference file but none was given"));
|
||||
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
|
||||
engine = microScheduler.getTraversalEngine();
|
||||
} else {
|
||||
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
|
||||
}
|
||||
|
||||
return microScheduler;
|
||||
|
|
|
|||
|
|
@ -1,32 +1,56 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.executive;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseReads;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseLoci;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseDuplicates;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseLoci;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseReads;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -51,8 +75,13 @@ public abstract class MicroScheduler {
|
|||
/**
|
||||
* MicroScheduler factory function. Create a microscheduler appropriate for reducing the
|
||||
* selected walker.
|
||||
* @param walker Which walker to use.
|
||||
*
|
||||
* @param walker Which walker to use.
|
||||
* @param reads the informations associated with the reads
|
||||
* @param ref the reference file
|
||||
* @param rods the rods to include in the traversal
|
||||
* @param nThreadsToUse Number of threads to utilize.
|
||||
*
|
||||
* @return The best-fit microscheduler.
|
||||
*/
|
||||
public static MicroScheduler create(Walker walker, Reads reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse) {
|
||||
|
|
@ -67,16 +96,24 @@ public abstract class MicroScheduler {
|
|||
|
||||
/**
|
||||
* Create a microscheduler given the reads and reference.
|
||||
* @param reads The reads.
|
||||
*
|
||||
* @param walker the walker to execute with
|
||||
* @param reads The reads.
|
||||
* @param refFile File pointer to the reference.
|
||||
* @param rods the rods to include in the traversal
|
||||
*/
|
||||
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
if (walker instanceof ReadWalker) {
|
||||
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
|
||||
this.reads = getReadsDataSource(reads,true);
|
||||
} else {
|
||||
this.reads = getReadsDataSource(reads, true);
|
||||
} else if (walker instanceof LocusWalker) {
|
||||
traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
|
||||
this.reads = getReadsDataSource(reads,false);
|
||||
this.reads = getReadsDataSource(reads, false);
|
||||
} else if (walker instanceof DuplicateWalker) {
|
||||
traversalEngine = new TraverseDuplicates(reads.getReadsFiles(), refFile, rods);
|
||||
this.reads = getReadsDataSource(reads, true);
|
||||
} else {
|
||||
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -87,6 +124,7 @@ public abstract class MicroScheduler {
|
|||
/**
|
||||
* A temporary getter for the traversal engine. In the future, clients
|
||||
* of the microscheduler shouldn't need to know anything about the traversal engine.
|
||||
*
|
||||
* @return The traversal engine.
|
||||
*/
|
||||
public TraversalEngine getTraversalEngine() {
|
||||
|
|
@ -95,18 +133,23 @@ public abstract class MicroScheduler {
|
|||
|
||||
/**
|
||||
* Walks a walker over the given list of intervals.
|
||||
* @param walker Computation to perform over dataset.
|
||||
* @param intervals A list of intervals over which to walk. Null for whole dataset.
|
||||
*
|
||||
* @param walker Computation to perform over dataset.
|
||||
* @param intervals A list of intervals over which to walk. Null for whole dataset.
|
||||
* @param maxIterations the maximum number of iterations we're to perform
|
||||
*
|
||||
* @return the return type of the walker
|
||||
*/
|
||||
public abstract Object execute(Walker walker, GenomeLocSortedSet intervals, Integer maxIterations);
|
||||
|
||||
/**
|
||||
* Get the sharding strategy given a driving data source.
|
||||
* @param walker Walker for which to infer sharding strategy.
|
||||
*
|
||||
* @param walker Walker for which to infer sharding strategy.
|
||||
* @param drivingDataSource Data on which to shard.
|
||||
* @param intervals Intervals to use when limiting sharding.
|
||||
* @param intervals Intervals to use when limiting sharding.
|
||||
* @param maxIterations the maximum number of iterations to run through
|
||||
*
|
||||
* @return Sharding strategy for this driving data source.
|
||||
*/
|
||||
protected ShardStrategy getShardStrategy(Walker walker,
|
||||
|
|
@ -118,31 +161,32 @@ public abstract class MicroScheduler {
|
|||
if (walker instanceof LocusWalker) {
|
||||
if (intervals != null) {
|
||||
shardType = (walker.isReduceByInterval()) ?
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.LINEAR;
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.LINEAR;
|
||||
|
||||
shardStrategy = ShardStrategyFactory.shatter(shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
intervals, maxIterations);
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
intervals, maxIterations);
|
||||
} else
|
||||
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE, maxIterations);
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE, maxIterations);
|
||||
|
||||
} else if (walker instanceof ReadWalker) {
|
||||
} else if (walker instanceof ReadWalker ||
|
||||
walker instanceof DuplicateWalker) {
|
||||
|
||||
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS;
|
||||
|
||||
if (intervals != null) {
|
||||
shardStrategy = ShardStrategyFactory.shatter(shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
intervals, maxIterations);
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
intervals, maxIterations);
|
||||
} else {
|
||||
shardStrategy = ShardStrategyFactory.shatter(shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE, maxIterations);
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE, maxIterations);
|
||||
}
|
||||
} else
|
||||
throw new StingException("Unable to support walker of type" + walker.getClass().getName());
|
||||
|
|
@ -152,7 +196,9 @@ public abstract class MicroScheduler {
|
|||
|
||||
/**
|
||||
* Gets an window into all the data that can be viewed as a single shard.
|
||||
*
|
||||
* @param shard The section of data to view.
|
||||
*
|
||||
* @return An accessor for all the data in this shard.
|
||||
*/
|
||||
protected ShardDataProvider getShardDataProvider(Shard shard) {
|
||||
|
|
@ -161,6 +207,10 @@ public abstract class MicroScheduler {
|
|||
|
||||
/**
|
||||
* Gets a data source for the given set of reads.
|
||||
*
|
||||
* @param reads the read source information
|
||||
* @param byReads are we a by reads traversal, or not
|
||||
*
|
||||
* @return A data source for the given set of reads.
|
||||
*/
|
||||
private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) {
|
||||
|
|
@ -180,6 +230,9 @@ public abstract class MicroScheduler {
|
|||
|
||||
/**
|
||||
* Open the reference-ordered data sources.
|
||||
*
|
||||
* @param rods the reference order data to execute using
|
||||
*
|
||||
* @return A list of reference-ordered data sources.
|
||||
*/
|
||||
private List<ReferenceOrderedDataSource> getReferenceOrderedDataSources(List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
|
|
@ -191,7 +244,9 @@ public abstract class MicroScheduler {
|
|||
|
||||
/**
|
||||
* Opens a reference sequence file paired with an index.
|
||||
*
|
||||
* @param refFile Handle to a reference sequence file. Non-null.
|
||||
*
|
||||
* @return A thread-safe file wrapper.
|
||||
*/
|
||||
private IndexedFastaSequenceFile openReferenceSequenceFile(File refFile) {
|
||||
|
|
|
|||
|
|
@ -1,3 +1,28 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
|
|
@ -29,22 +54,12 @@ import java.util.List;
|
|||
* User: aaron
|
||||
* Date: Apr 24, 2009
|
||||
* Time: 10:35:22 AM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author Mark DePristo
|
||||
* @version 0.1
|
||||
* @date Apr 29, 2009
|
||||
* <p/>
|
||||
* Class TraverseDuplicates
|
||||
* <p/>
|
||||
|
|
@ -67,17 +82,16 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
super(reads, ref, rods);
|
||||
}
|
||||
|
||||
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter)
|
||||
{
|
||||
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter) {
|
||||
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
|
||||
ArrayList<SAMRecord> l = new ArrayList<SAMRecord>();
|
||||
|
||||
l.add(read);
|
||||
for (SAMRecord read2: iter) {
|
||||
for (SAMRecord read2 : iter) {
|
||||
GenomeLoc site2 = GenomeLocParser.createGenomeLoc(read2);
|
||||
|
||||
// the next read starts too late
|
||||
if ( site2.getStart() != site.getStart() ) {
|
||||
if (site2.getStart() != site.getStart()) {
|
||||
//System.out.printf("site = %s, site2 = %s%n", site, site2);
|
||||
iter.pushback(read2);
|
||||
break;
|
||||
|
|
@ -96,8 +110,8 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
|
||||
// find the first duplicate
|
||||
SAMRecord key = null;
|
||||
for ( SAMRecord read : reads ) {
|
||||
if ( read.getDuplicateReadFlag() ) {
|
||||
for (SAMRecord read : reads) {
|
||||
if (read.getDuplicateReadFlag()) {
|
||||
// this is our key
|
||||
key = read;
|
||||
if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName()));
|
||||
|
|
@ -107,20 +121,21 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
|
||||
// At this point, there are two possibilities, we have found at least one dup or not
|
||||
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
|
||||
if ( key != null ) {
|
||||
if (key != null) {
|
||||
final GenomeLoc keyLoc = GenomeLocParser.createGenomeLoc(key);
|
||||
final GenomeLoc keyMateLoc = GenomeLocParser.createGenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
|
||||
|
||||
for ( SAMRecord read : reads ) {
|
||||
for (SAMRecord read : reads) {
|
||||
final GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
|
||||
final GenomeLoc readMateLoc = GenomeLocParser.createGenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart());
|
||||
if (DEBUG) logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
|
||||
if (DEBUG)
|
||||
logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc));
|
||||
|
||||
// read and key start at the same place, and either the this read and the key
|
||||
// share a mate location or the read is flagged as a duplicate
|
||||
if ( readLoc.compareTo(keyLoc) == 0 &&
|
||||
( readMateLoc.compareTo(keyMateLoc) == 0) ||
|
||||
read.getDuplicateReadFlag() ) {
|
||||
if (readLoc.compareTo(keyLoc) == 0 &&
|
||||
(readMateLoc.compareTo(keyMateLoc) == 0) ||
|
||||
read.getDuplicateReadFlag()) {
|
||||
// we are at the same position as the dup and have the same mat pos, it's a dup
|
||||
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
|
||||
dups.add(read);
|
||||
|
|
@ -137,33 +152,43 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
|
||||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
*
|
||||
* @param sum of type T, the return from the walker
|
||||
* @param <M> the generic type
|
||||
* @param <T> the return type of the reduce function
|
||||
* @return
|
||||
* @param dupWalker our duplicates walker
|
||||
* @param readIter our iterator
|
||||
*
|
||||
* @return the reduce type, T, the final product of all the reduce calls
|
||||
*/
|
||||
public <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
|
||||
Iterator<SAMRecord> readIter,
|
||||
T sum) {
|
||||
// while we still have more reads
|
||||
// ok, here's the idea. We get all the reads that start at the same position in the genome
|
||||
// We then split the list of reads into sublists of reads:
|
||||
// -> those with the same mate pair position, for paired reads
|
||||
// -> those flagged as unpaired and duplicated but having the same start and end and
|
||||
private <M, T> T actuallyTraverse(DuplicateWalker<M, T> dupWalker,
|
||||
Iterator<SAMRecord> readIter,
|
||||
T sum) {
|
||||
/**
|
||||
* while we still have more reads:
|
||||
* ok, here's the idea. We get all the reads that start at the same position in the genome
|
||||
* We then split the list of reads into sublists of reads:
|
||||
* -> those with the same mate pair position, for paired reads
|
||||
* -> those flagged as unpaired and duplicated but having the same start and end and
|
||||
*/
|
||||
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(readIter);
|
||||
for (SAMRecord read: iter) {
|
||||
|
||||
|
||||
for (SAMRecord read : iter) {
|
||||
|
||||
// get the genome loc from the read
|
||||
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
|
||||
|
||||
List<SAMRecord> reads = readsAtLoc(read, iter);
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
|
||||
List<SAMRecord> uniqueReads = split.getFirst();
|
||||
List<SAMRecord> duplicateReads = split.getSecond();
|
||||
List<SAMRecord> uniqueReads = split.getFirst();
|
||||
List<SAMRecord> duplicateReads = split.getSecond();
|
||||
|
||||
logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d reads has %d unique and %d duplicate reads",
|
||||
site, reads.size(), uniqueReads.size(), duplicateReads.size()));
|
||||
if ( reads.size() != uniqueReads.size() + duplicateReads.size() )
|
||||
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
|
||||
reads.size(), uniqueReads.size(), duplicateReads.size()));
|
||||
if (reads.size() != uniqueReads.size() + duplicateReads.size())
|
||||
throw new RuntimeException(String.format("Bug occurred spliting reads [N=%d] at loc %s into unique [N=%d] and duplicates [N=%d], sizes don't match",
|
||||
reads.size(), site.toString(), uniqueReads.size(), duplicateReads.size()));
|
||||
|
||||
// Jump forward in the reference to this locus location
|
||||
LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0));
|
||||
|
|
@ -176,15 +201,15 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
|
||||
byte[] refBases = new byte[0];
|
||||
|
||||
if ( dupWalker.mapUniqueReadsTooP() ) {
|
||||
if (dupWalker.mapUniqueReadsTooP()) {
|
||||
// Send each unique read to the map function
|
||||
for ( SAMRecord unique : uniqueReads ) {
|
||||
for (SAMRecord unique : uniqueReads) {
|
||||
List<SAMRecord> l = Arrays.asList(unique);
|
||||
sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
|
||||
}
|
||||
}
|
||||
|
||||
if ( duplicateReads.size() > 0 )
|
||||
if (duplicateReads.size() > 0)
|
||||
sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
|
||||
|
||||
printProgress("dups", site);
|
||||
|
|
@ -206,21 +231,16 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
SAMRecord lastRead = null;
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
boolean result = false;
|
||||
String why = "";
|
||||
if (rec.getReadUnmappedFlag()) {
|
||||
TraversalStatistics.nUnmappedReads++;
|
||||
result = true;
|
||||
why = "Unmapped";
|
||||
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
||||
TraversalStatistics.nNotPrimary++;
|
||||
result = true;
|
||||
why = "Not Primary";
|
||||
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
||||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
why = "No alignment start";
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
result = false;
|
||||
}
|
||||
|
||||
|
|
@ -250,7 +270,6 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
}
|
||||
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// new style interface to the system
|
||||
|
|
@ -258,19 +277,21 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
// --------------------------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
*
|
||||
* @param walker the walker to execute over
|
||||
* @param shard the shard of data to feed the walker
|
||||
* @param sum of type T, the return from the walker
|
||||
* @param <M> the generic type
|
||||
* @param <T> the return type of the reduce function
|
||||
* @return
|
||||
* @param shard the shard of data to feed the walker
|
||||
* @param sum of type T, the return from the walker
|
||||
* @param <M> the generic type
|
||||
* @param <T> the return type of the reduce function
|
||||
*
|
||||
* @return the result type T, the product of all the reduce calls
|
||||
*/
|
||||
public <M, T> T traverse(Walker<M, T> walker,
|
||||
Shard shard,
|
||||
ShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
|
||||
logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard)shard).getSize()));
|
||||
logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
|
||||
|
||||
if (!(walker instanceof DuplicateWalker))
|
||||
throw new IllegalArgumentException("Walker isn't a duplicate walker!");
|
||||
|
|
@ -287,4 +308,16 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
PushbackIterator<SAMRecord> iter = new PushbackIterator<SAMRecord>(filterIter);
|
||||
return actuallyTraverse(dupWalker, iter, sum);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Temporary override of printOnTraversalDone.
|
||||
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
|
||||
*
|
||||
* @param sum Result of the computation.
|
||||
* @param <T> Type of the result.
|
||||
*/
|
||||
public <T> void printOnTraversalDone(T sum) {
|
||||
printOnTraversalDone("reads", sum);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* a class to store the duplicates information we pass around
|
||||
*/
|
||||
class DuplicateCount {
|
||||
public int count = 0;
|
||||
public int undupDepth = 0;
|
||||
public int depth = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class CountDuplicatesWalker
|
||||
*
|
||||
* Count the number of duplicates, and the average depth of duplicates at all positions.
|
||||
*/
|
||||
public class CountDuplicatesWalker extends DuplicateWalker<DuplicateCount, DuplicateCount> {
|
||||
|
||||
/**
|
||||
* the map function, conforming to the duplicates interface
|
||||
* @param loc the genomic location
|
||||
* @param refBases the reference bases that cover this position, which we turn off
|
||||
* @param context the LocusContext, containing all the reads overlapping this region
|
||||
* @param uniqueReads all the unique reads, bundled together
|
||||
* @param duplicateReads all the duplicate reads
|
||||
* @return a DuplicateCount object, with the appropriate stats
|
||||
*/
|
||||
public DuplicateCount map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> uniqueReads, List<SAMRecord> duplicateReads) {
|
||||
DuplicateCount dup = new DuplicateCount();
|
||||
dup.count = 1;
|
||||
dup.undupDepth = uniqueReads.size();
|
||||
dup.depth = duplicateReads.size();
|
||||
return dup;
|
||||
}
|
||||
|
||||
/**
|
||||
* setup our walker. In this case, new a DuplicateCount object and return it
|
||||
* @return the object holding the counts of the duplicates
|
||||
*/
|
||||
public DuplicateCount reduceInit() {
|
||||
return new DuplicateCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* the reduce step. This function combines the DuplicateCount objects, and updates the running depth average
|
||||
* @param value the new DuplicateCount
|
||||
* @param sum the running sum DuplicateCount
|
||||
* @return a new DuplicateCount with the updated sums
|
||||
*/
|
||||
public DuplicateCount reduce(DuplicateCount value, DuplicateCount sum) {
|
||||
DuplicateCount dup = new DuplicateCount();
|
||||
dup.count = sum.count + value.count;
|
||||
dup.depth = value.depth + sum.depth;
|
||||
dup.undupDepth = value.undupDepth + sum.undupDepth;
|
||||
return dup;
|
||||
}
|
||||
|
||||
/**
|
||||
* when we're done, print out the collected stats
|
||||
* @param result the result of the traversal engine, to be printed out
|
||||
*/
|
||||
public void onTraversalDone(DuplicateCount result) {
|
||||
out.println("[REDUCE RESULT] Traversal result is: ");
|
||||
out.println("duplicate count = " + result.count);
|
||||
out.println("average depth = " + (double)result.depth / (double)result.count);
|
||||
out.println("duplicates seen = " + result.depth);
|
||||
out.println("average unique depth = " + (double)result.undupDepth / (double)result.undupDepth);
|
||||
out.println("unique read count = " + result.undupDepth);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -1,20 +1,44 @@
|
|||
package org.broadinstitute.sting.playground.gatk.duplicates.walkers;
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
|
||||
import org.broadinstitute.sting.utils.duplicates.DupUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.PrintStream;
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.duplicates;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.duplicates.DupUtils;
|
||||
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
class MismatchCounter {
|
||||
long nObs = 0;
|
||||
|
|
|
|||
Loading…
Reference in New Issue