2009-04-29 01:55:08 +08:00
|
|
|
package org.broadinstitute.sting.gatk.traversals;
|
|
|
|
|
|
|
|
|
|
import net.sf.samtools.SAMRecord;
|
|
|
|
|
import org.apache.log4j.Logger;
|
|
|
|
|
import org.broadinstitute.sting.gatk.LocusContext;
|
2009-05-15 00:52:18 +08:00
|
|
|
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
|
2009-05-23 03:12:00 +08:00
|
|
|
import org.broadinstitute.sting.gatk.dataSources.providers.ReadView;
|
|
|
|
|
import org.broadinstitute.sting.gatk.dataSources.providers.ReadReferenceView;
|
2009-04-29 03:49:58 +08:00
|
|
|
import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
|
2009-04-29 01:55:08 +08:00
|
|
|
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
2009-05-28 02:24:31 +08:00
|
|
|
import org.broadinstitute.sting.gatk.dataSources.shards.IntervalShard;
|
2009-04-29 01:55:08 +08:00
|
|
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
|
|
|
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
|
|
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
|
|
|
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
|
|
|
|
|
|
|
|
import java.io.File;
|
2009-05-01 04:35:56 +08:00
|
|
|
import java.util.ArrayList;
|
2009-04-29 03:49:58 +08:00
|
|
|
import java.util.Arrays;
|
2009-05-01 04:35:56 +08:00
|
|
|
import java.util.List;
|
2009-04-29 01:55:08 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
*
|
|
|
|
|
* User: aaron
|
|
|
|
|
* Date: Apr 24, 2009
|
|
|
|
|
* Time: 10:35:22 AM
|
|
|
|
|
*
|
|
|
|
|
* The Broad Institute
|
|
|
|
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
|
|
|
* This software and its documentation are copyright 2009 by the
|
|
|
|
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
|
|
|
*
|
|
|
|
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
|
|
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @author aaron
|
|
|
|
|
* @version 1.0
|
|
|
|
|
* @date Apr 24, 2009
|
|
|
|
|
* <p/>
|
|
|
|
|
* Class TraverseReads
|
|
|
|
|
* <p/>
|
|
|
|
|
* This class handles traversing by reads in the new shardable style
|
|
|
|
|
*/
|
|
|
|
|
public class TraverseReads extends TraversalEngine {
|
2009-05-01 04:35:56 +08:00
|
|
|
final ArrayList<String> x = new ArrayList<String>();
|
2009-04-29 01:55:08 +08:00
|
|
|
|
|
|
|
|
/** our log, which we want to capture anything from this class */
|
|
|
|
|
protected static Logger logger = Logger.getLogger(TraverseReads.class);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Creates a new, uninitialized TraversalEngine
|
|
|
|
|
*
|
|
|
|
|
* @param reads SAM/BAM file of reads
|
|
|
|
|
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
|
|
|
|
* @param rods Array of reference ordered data sets
|
|
|
|
|
*/
|
|
|
|
|
public TraverseReads(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
|
|
|
|
super(reads, ref, rods);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Traverse by reads, given the data and the walker
|
2009-05-01 04:35:56 +08:00
|
|
|
*
|
2009-04-29 01:55:08 +08:00
|
|
|
* @param walker the walker to execute over
|
2009-05-01 04:35:56 +08:00
|
|
|
* @param shard the shard of data to feed the walker
|
|
|
|
|
* @param sum of type T, the return from the walker
|
|
|
|
|
* @param <M> the generic type
|
|
|
|
|
* @param <T> the return type of the reduce function
|
2009-04-29 01:55:08 +08:00
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public <M, T> T traverse(Walker<M, T> walker,
|
|
|
|
|
Shard shard,
|
2009-05-09 05:27:54 +08:00
|
|
|
ShardDataProvider dataProvider,
|
2009-04-29 01:55:08 +08:00
|
|
|
T sum) {
|
|
|
|
|
|
2009-05-28 02:24:31 +08:00
|
|
|
if (shard instanceof ReadShard) {
|
|
|
|
|
logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
|
|
|
|
|
} else if (shard instanceof IntervalShard) {
|
|
|
|
|
logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((IntervalShard) shard).getGenomeLoc()));
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-29 01:55:08 +08:00
|
|
|
|
|
|
|
|
if (!(walker instanceof ReadWalker))
|
|
|
|
|
throw new IllegalArgumentException("Walker isn't a read walker!");
|
|
|
|
|
|
2009-05-09 05:27:54 +08:00
|
|
|
if( !dataProvider.hasReads() )
|
|
|
|
|
throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");
|
|
|
|
|
|
2009-04-29 01:55:08 +08:00
|
|
|
ReadWalker<M, T> readWalker = (ReadWalker<M, T>) walker;
|
2009-04-29 03:49:58 +08:00
|
|
|
|
2009-05-23 03:12:00 +08:00
|
|
|
ReadView reads = new ReadView(dataProvider);
|
|
|
|
|
ReadReferenceView reference = new ReadReferenceView(dataProvider);
|
|
|
|
|
|
2009-04-29 01:55:08 +08:00
|
|
|
// while we still have more reads
|
2009-05-23 03:12:00 +08:00
|
|
|
for (SAMRecord read : reads) {
|
2009-04-29 01:55:08 +08:00
|
|
|
|
2009-05-06 04:36:00 +08:00
|
|
|
// our locus context
|
|
|
|
|
LocusContext locus = null;
|
2009-04-29 01:55:08 +08:00
|
|
|
|
2009-05-11 11:42:38 +08:00
|
|
|
// an array of characters that represent the reference
|
|
|
|
|
char[] refSeq = null;
|
|
|
|
|
|
2009-05-06 04:36:00 +08:00
|
|
|
if (read.getReferenceIndex() >= 0) {
|
|
|
|
|
// get the genome loc from the read
|
|
|
|
|
GenomeLoc site = new GenomeLoc(read);
|
|
|
|
|
|
|
|
|
|
// Jump forward in the reference to this locus location
|
|
|
|
|
locus = new LocusContext(site, Arrays.asList(read), Arrays.asList(0));
|
2009-05-11 11:42:38 +08:00
|
|
|
|
|
|
|
|
// get the array of characters for the reference sequence, since we're a mapped read
|
2009-05-12 21:29:44 +08:00
|
|
|
if( dataProvider.hasReference() )
|
2009-05-23 03:12:00 +08:00
|
|
|
refSeq = reference.getReferenceBases( read );
|
2009-05-06 04:36:00 +08:00
|
|
|
}
|
2009-04-29 03:49:58 +08:00
|
|
|
|
2009-04-29 01:55:08 +08:00
|
|
|
// update the number of reads we've seen
|
|
|
|
|
TraversalStatistics.nRecords++;
|
|
|
|
|
|
2009-05-12 06:45:11 +08:00
|
|
|
final boolean keepMeP = readWalker.filter(refSeq, read);
|
2009-04-29 01:55:08 +08:00
|
|
|
if (keepMeP) {
|
2009-05-12 06:45:11 +08:00
|
|
|
M x = readWalker.map(refSeq, read);
|
2009-04-29 01:55:08 +08:00
|
|
|
sum = readWalker.reduce(x, sum);
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-06 04:36:00 +08:00
|
|
|
if (locus != null) { printProgress("loci", locus.getLocation()); }
|
2009-04-29 01:55:08 +08:00
|
|
|
}
|
|
|
|
|
return sum;
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-08 22:12:45 +08:00
|
|
|
/**
|
|
|
|
|
* Temporary override of printOnTraversalDone.
|
|
|
|
|
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
|
|
|
|
|
* @param sum Result of the computation.
|
|
|
|
|
* @param <T> Type of the result.
|
|
|
|
|
*/
|
|
|
|
|
public <T> void printOnTraversalDone( T sum ) {
|
|
|
|
|
printOnTraversalDone( "reads", sum );
|
2009-05-15 00:52:18 +08:00
|
|
|
}
|
2009-04-29 01:55:08 +08:00
|
|
|
}
|