Bringing up scaffolding for integration of locus traversals by reference with Aaron's data source code.
Reverts to original TraverseByLociByReference behavior unless a special combination of command-line flags are used. Lightly tested at best, and major flaws include: - MicroManager is not doing MicroScheduling right now; it's driving the traversals. - New database-ish data providers imply by their interface that they're stateless, but they're highly stateful. - Using static objects to circumvent encapsulation. - Code duplication is rampant. - Plus more! git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@346 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
49b2622e3d
commit
8a1207e4db
|
|
@ -20,6 +20,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
|||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.traversals.*;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroManager;
|
||||
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -65,6 +66,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
private TraversalEngine engine = null;
|
||||
public boolean DEBUGGING = false;
|
||||
public Boolean WALK_ALL_LOCI = false;
|
||||
public Boolean ENABLE_THREADING = false;
|
||||
|
||||
/**
|
||||
* An output file presented to the walker.
|
||||
|
|
@ -81,6 +83,11 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
*/
|
||||
public String outErrFileName = null;
|
||||
|
||||
/**
|
||||
* How many threads should be allocated to this analysis.
|
||||
*/
|
||||
public int numThreads = 1;
|
||||
|
||||
/**
|
||||
* The output stream, initialized from OUTFILENAME / OUTERRFILENAME.
|
||||
* Used by the walker.
|
||||
|
|
@ -127,7 +134,9 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" );
|
||||
m_parser.addOptionalArg("err", "e", "An error output file presented to the walker. Will overwrite contents if file exists.", "errFileName" );
|
||||
m_parser.addOptionalArg("outerr", "oe", "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", "outErrFileName");
|
||||
|
||||
|
||||
m_parser.addOptionalArg("numthreads", "nt", "How many threads should be allocated to running this analysis.", "numThreads");
|
||||
m_parser.addOptionalFlag("enablethreading", "et", "Enable experimental threading support.", "ENABLE_THREADING");
|
||||
//TODO: remove when walkers can ask for tracks
|
||||
m_parser.addOptionalArg("mother", "MOM", "Mother's genotype (SAM pileup)", "MOTHER_GENOTYPE_FILE");
|
||||
m_parser.addOptionalArg("father", "DAD", "Father's genotype (SAM pileup)", "FATHER_GENOTYPE_FILE");
|
||||
|
|
@ -138,7 +147,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
.hasArgs()
|
||||
.withDescription( "Bind rod with <name> and <type> to <file>" )
|
||||
.create("B");
|
||||
m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS");
|
||||
m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -233,6 +242,15 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
throw new RuntimeException( "Unable to access walker", ex );
|
||||
}
|
||||
|
||||
// Prepare the sort ordering w.r.t. the sequence dictionary
|
||||
FastaSequenceFile2 refFile = null;
|
||||
if (REF_FILE_ARG != null) {
|
||||
refFile = new FastaSequenceFile2(REF_FILE_ARG);
|
||||
GenomeLoc.setupRefContigOrdering(refFile);
|
||||
}
|
||||
|
||||
MicroManager microManager = null;
|
||||
|
||||
// Try to get the walker specified
|
||||
try {
|
||||
LocusWalker<?, ?> walker = (LocusWalker<?, ?>) my_walker;
|
||||
|
|
@ -245,9 +263,18 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
if ( walker.cannotHandleReads() )
|
||||
Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILE));
|
||||
|
||||
|
||||
if ( WALK_ALL_LOCI )
|
||||
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
if ( WALK_ALL_LOCI ) {
|
||||
// TODO: Temporary debugging code. Activate the new debugging code only when the MicroManager
|
||||
// is not filtered.
|
||||
if( ENABLE_THREADING && REGION_STR == null ) {
|
||||
logger.warn("Preliminary threading support enabled");
|
||||
microManager = new MicroManager( INPUT_FILE, REF_FILE_ARG, numThreads );
|
||||
this.engine = microManager.getTraversalEngine();
|
||||
}
|
||||
else {
|
||||
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
}
|
||||
}
|
||||
else
|
||||
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
}
|
||||
|
|
@ -263,14 +290,13 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
|
||||
GenomeLoc.setupRefContigOrdering(refFile);
|
||||
}
|
||||
|
||||
// Determine the validation stringency. Default to ValidationStringency.STRICT.
|
||||
ValidationStringency strictness;
|
||||
if (STRICTNESS_ARG == null) {
|
||||
strictness = ValidationStringency.STRICT;
|
||||
} else if (STRICTNESS_ARG.toLowerCase().equals("lenient")) {
|
||||
strictness = ValidationStringency.LENIENT;
|
||||
} else if (STRICTNESS_ARG.toLowerCase().equals("silent")) {
|
||||
strictness = ValidationStringency.SILENT;
|
||||
} else {
|
||||
try {
|
||||
strictness = Enum.valueOf(ValidationStringency.class, STRICTNESS_ARG);
|
||||
}
|
||||
catch( IllegalArgumentException ex ) {
|
||||
strictness = ValidationStringency.STRICT;
|
||||
}
|
||||
logger.info("Strictness is " + strictness);
|
||||
|
|
@ -304,7 +330,12 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
engine.setWalkOverAllSites(WALK_ALL_LOCI);
|
||||
engine.initialize();
|
||||
|
||||
engine.traverse(my_walker);
|
||||
if( microManager != null ) {
|
||||
List<GenomeLoc> locations = GenomeLoc.parseGenomeLocs( REGION_STR );
|
||||
microManager.execute( my_walker, locations );
|
||||
}
|
||||
else
|
||||
engine.traverse(my_walker);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,112 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.providers;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByHanger;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
|
||||
import org.apache.log4j.Logger;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import edu.mit.broad.picard.filter.FilteringIterator;
|
||||
import edu.mit.broad.picard.filter.SamRecordFilter;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: hanna
|
||||
* Date: Apr 8, 2009
|
||||
* Time: 3:00:28 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class LocusContextProvider {
|
||||
private Iterator<SAMRecord> reads;
|
||||
|
||||
// What's the last locus accessed? Used for sanity checking.
|
||||
private GenomeLoc lastLoc = null;
|
||||
private LocusIterator loci;
|
||||
|
||||
protected static Logger logger = Logger.getLogger(LocusContextProvider.class);
|
||||
|
||||
public LocusContextProvider( Iterator<SAMRecord> reads ) {
|
||||
this.reads = new FilteringIterator(reads, new locusStreamFilterFunc());
|
||||
// prepare the iterator by loci from reads
|
||||
loci = new LocusIteratorByHanger(this.reads);
|
||||
}
|
||||
|
||||
public LocusContext getLocusContext( GenomeLoc loc ) {
|
||||
// Precondition checks
|
||||
if( lastLoc != null && !loc.isPast( lastLoc ) )
|
||||
throw new RuntimeException( "Internal error: LocusContextProvider assumes that queries it receives are ordered." );
|
||||
|
||||
if( (loc.getStop() - loc.getStart()) > 0 )
|
||||
throw new RuntimeException( "Internal error :LocusContextProviders currently require 1-base genomeLocs.");
|
||||
|
||||
// jump to the first reference site
|
||||
LocusContext locusContext = advanceReadsToLoc( loci, loc );
|
||||
|
||||
// if no locus context was found, create an empty locus
|
||||
if ( locusContext == null || locusContext.getLocation().compareTo( loc ) != 0 )
|
||||
locusContext = new LocusContext(loc, new ArrayList<SAMRecord>(), new ArrayList<Integer>());
|
||||
|
||||
lastLoc = loc;
|
||||
|
||||
return locusContext;
|
||||
}
|
||||
|
||||
private LocusContext advanceReadsToLoc(LocusIterator locusIter, GenomeLoc target) {
|
||||
if ( ! locusIter.hasNext() )
|
||||
return null;
|
||||
|
||||
LocusContext locus = locusIter.next();
|
||||
|
||||
while ( ! target.containsP(locus.getLocation()) && locusIter.hasNext() ) {
|
||||
logger.debug(String.format(" current locus is %s vs %s => %d", locus.getLocation(), target, locus.getLocation().compareTo(target)));
|
||||
locus = locusIter.next();
|
||||
}
|
||||
|
||||
logger.debug(String.format(" returning %s", locus.getLocation()));
|
||||
return locus;
|
||||
}
|
||||
|
||||
/**
|
||||
* Class to filter out un-handle-able reads from the stream. We currently are skipping
|
||||
* unmapped reads, non-primary reads, unaligned reads, and those with indels. We should
|
||||
* really change this to handle indel containing reads.
|
||||
* TODO: Update TraversalEngine with our runtime stats.
|
||||
*/
|
||||
private class locusStreamFilterFunc implements SamRecordFilter {
|
||||
SAMRecord lastRead = null;
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
boolean result = false;
|
||||
String why = "";
|
||||
if (rec.getReadUnmappedFlag()) {
|
||||
TraversalStatistics.nUnmappedReads++;
|
||||
result = true;
|
||||
why = "Unmapped";
|
||||
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
||||
TraversalStatistics.nNotPrimary++;
|
||||
result = true;
|
||||
why = "Not Primary";
|
||||
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
||||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
why = "No alignment start";
|
||||
}
|
||||
else {
|
||||
result = false;
|
||||
}
|
||||
|
||||
if (result) {
|
||||
//nSkippedReads++;
|
||||
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
|
||||
} else {
|
||||
//nReads++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.providers;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: hanna
|
||||
* Date: Apr 8, 2009
|
||||
* Time: 5:01:37 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ReferenceProvider {
|
||||
private ReferenceIterator reference;
|
||||
|
||||
public ReferenceProvider( ReferenceIterator reference ) {
|
||||
this.reference = reference;
|
||||
}
|
||||
|
||||
public ReferenceIterator getReferenceSequence( GenomeLoc genomeLoc ) {
|
||||
if( (genomeLoc.getStop() - genomeLoc.getStart()) > 0 )
|
||||
throw new RuntimeException( "Internal error :LocusContextProviders currently require 1-base genomeLocs.");
|
||||
|
||||
// jump to the first reference site
|
||||
return reference.seekForward(genomeLoc);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,22 +1,88 @@
|
|||
package org.broadinstitute.sting.gatk.executive;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A micro-scheduling manager for N-way threaded execution of a traversal
|
||||
*
|
||||
*/
|
||||
public class MicroManager {
|
||||
private static long SHARD_SIZE = 5L;
|
||||
|
||||
public MicroManager( TraversalEngineExecutive TEfactory, // makes worker units
|
||||
GenomeLoc[] locations, // list of work to do
|
||||
int nThreadsToUse, // maximum number of threads to use to do the work
|
||||
int initialChunkSize ) { // the initial chunk size for dividing up the work
|
||||
// do a lot of work here to organize the computation
|
||||
private File reads;
|
||||
private FastaSequenceFile2 ref;
|
||||
|
||||
private TraverseLociByReference traversalEngine = null;
|
||||
|
||||
protected static Logger logger = Logger.getLogger(MicroManager.class);
|
||||
|
||||
public TraversalEngine getTraversalEngine() {
|
||||
return traversalEngine;
|
||||
}
|
||||
|
||||
public void execute() {
|
||||
// actually divide up the work, create worker units, and send them off to do work
|
||||
public MicroManager( File reads, // the reads file
|
||||
File refFile, // the reference file driving the traversal
|
||||
int nThreadsToUse ) { // maximum number of threads to use to do the work
|
||||
|
||||
this.reads = reads;
|
||||
ref = new FastaSequenceFile2(refFile);
|
||||
GenomeLoc.setupRefContigOrdering(ref);
|
||||
|
||||
traversalEngine = new TraverseLociByReference( reads, refFile, new java.util.ArrayList() );
|
||||
}
|
||||
|
||||
|
||||
public void execute( Walker walker, // the analysis technique to use.
|
||||
List<GenomeLoc> locations ) { // list of work to do
|
||||
ShardStrategy shardStrategy = ShardStrategyFactory.shatter( ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
|
||||
ref.getSequenceDictionary(),
|
||||
SHARD_SIZE );
|
||||
Object accumulator = ((LocusWalker<?,?>)walker).reduceInit();
|
||||
|
||||
for(Shard shard: shardStrategy) {
|
||||
Iterator<SAMRecord> readShard = null;
|
||||
try {
|
||||
SAMBAMDataSource dataSource = new SAMBAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) );
|
||||
readShard = dataSource.seek( shard.getGenomeLoc() );
|
||||
}
|
||||
catch( SimpleDataSourceLoadException ex ) {
|
||||
throw new RuntimeException( ex );
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new RuntimeException( ex );
|
||||
}
|
||||
|
||||
ReferenceProvider referenceProvider = new ReferenceProvider( new ReferenceIterator(ref) );
|
||||
LocusContextProvider locusProvider = new LocusContextProvider( readShard );
|
||||
|
||||
accumulator = traversalEngine.traverse( walker, shard, referenceProvider, locusProvider, accumulator );
|
||||
}
|
||||
|
||||
traversalEngine.printOnTraversalDone("loci", accumulator);
|
||||
walker.onTraversalDone(accumulator);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,31 +1,21 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import edu.mit.broad.picard.filter.FilteringIterator;
|
||||
import edu.mit.broad.picard.filter.SamRecordFilter;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
||||
import edu.mit.broad.picard.directed.IntervalList;
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
import net.sf.functionalj.Function1;
|
||||
import net.sf.functionalj.FunctionN;
|
||||
import net.sf.functionalj.Functions;
|
||||
import net.sf.functionalj.reflect.JdkStdReflect;
|
||||
import net.sf.functionalj.reflect.StdReflect;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.io.*;
|
||||
|
|
@ -66,16 +56,6 @@ public abstract class TraversalEngine {
|
|||
protected FastaSequenceFile2 refFile = null; // todo: merge FastaSequenceFile2 into picard!
|
||||
protected ReferenceIterator refIter = null;
|
||||
|
||||
// Number of records (loci, reads) we've processed
|
||||
protected long nRecords = 0;
|
||||
// How many reads have we processed, along with those skipped for various reasons
|
||||
protected int nReads = 0;
|
||||
protected int nSkippedReads = 0;
|
||||
protected int nUnmappedReads = 0;
|
||||
protected int nNotPrimary = 0;
|
||||
protected int nBadAlignments = 0;
|
||||
protected int nSkippedIndels = 0;
|
||||
|
||||
// Progress tracker for the sam file
|
||||
protected FileProgressTracker samReadingTracker = null;
|
||||
|
||||
|
|
@ -277,7 +257,7 @@ public abstract class TraversalEngine {
|
|||
* @param loc Current location
|
||||
*/
|
||||
public void printProgress(boolean mustPrint, final String type, GenomeLoc loc) {
|
||||
final long nRecords = this.nRecords;
|
||||
final long nRecords = TraversalStatistics.nRecords;
|
||||
final long curTime = System.currentTimeMillis();
|
||||
final double elapsed = (curTime - startTime) / 1000.0;
|
||||
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
|
||||
|
|
@ -307,17 +287,20 @@ public abstract class TraversalEngine {
|
|||
* @param sum The reduce result of the traversal
|
||||
* @param <T> ReduceType of the traversal
|
||||
*/
|
||||
protected <T> void printOnTraversalDone(final String type, T sum) {
|
||||
public <T> void printOnTraversalDone(final String type, T sum) {
|
||||
printProgress(true, type, null);
|
||||
logger.info("Traversal reduce result is " + sum);
|
||||
final long curTime = System.currentTimeMillis();
|
||||
final double elapsed = (curTime - startTime) / 1000.0;
|
||||
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
|
||||
logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)", nSkippedReads, nReads, (nSkippedReads * 100.0) / nReads));
|
||||
logger.info(String.format(" -> %d unmapped reads", nUnmappedReads));
|
||||
logger.info(String.format(" -> %d non-primary reads", nNotPrimary));
|
||||
logger.info(String.format(" -> %d reads with bad alignments", nBadAlignments));
|
||||
logger.info(String.format(" -> %d reads with indels", nSkippedIndels));
|
||||
logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)",
|
||||
TraversalStatistics.nSkippedReads,
|
||||
TraversalStatistics.nReads,
|
||||
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
|
||||
logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
|
||||
logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary));
|
||||
logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments));
|
||||
logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels));
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -520,27 +503,27 @@ public abstract class TraversalEngine {
|
|||
boolean result = false;
|
||||
String why = "";
|
||||
if (rec.getReadUnmappedFlag()) {
|
||||
nUnmappedReads++;
|
||||
TraversalStatistics.nUnmappedReads++;
|
||||
result = true;
|
||||
why = "Unmapped";
|
||||
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
||||
nNotPrimary++;
|
||||
TraversalStatistics.nNotPrimary++;
|
||||
result = true;
|
||||
why = "Not Primary";
|
||||
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
||||
nBadAlignments++;
|
||||
TraversalStatistics.nBadAlignments++;
|
||||
result = true;
|
||||
why = "No alignment start";
|
||||
}
|
||||
}
|
||||
else {
|
||||
result = false;
|
||||
}
|
||||
|
||||
if (result) {
|
||||
nSkippedReads++;
|
||||
TraversalStatistics.nSkippedReads++;
|
||||
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
|
||||
} else {
|
||||
nReads++;
|
||||
TraversalStatistics.nReads++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,36 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: hanna
|
||||
* Date: Apr 8, 2009
|
||||
* Time: 4:13:40 PM
|
||||
*
|
||||
* Holds a bunch of basic information about the traversal.
|
||||
* TODO: Make this a class that can be passed around from the TraversalEngine to other entries that want to update it.
|
||||
*/
|
||||
public class TraversalStatistics {
|
||||
// Number of records (loci, reads) we've processed
|
||||
public static long nRecords;
|
||||
// How many reads have we processed, along with those skipped for various reasons
|
||||
public static int nReads;
|
||||
public static int nSkippedReads;
|
||||
public static int nUnmappedReads;
|
||||
public static int nNotPrimary;
|
||||
public static int nBadAlignments;
|
||||
public static int nSkippedIndels;
|
||||
|
||||
static {
|
||||
reset();
|
||||
}
|
||||
|
||||
public static void reset() {
|
||||
nRecords = 0;
|
||||
nReads = 0;
|
||||
nSkippedReads = 0;
|
||||
nUnmappedReads = 0;
|
||||
nNotPrimary = 0;
|
||||
nBadAlignments = 0;
|
||||
nSkippedIndels = 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -107,7 +107,7 @@ public class TraverseByLoci extends TraversalEngine {
|
|||
boolean done = false;
|
||||
LocusIterator iter = new LocusIteratorByHanger(filterIter);
|
||||
while (iter.hasNext() && !done) {
|
||||
this.nRecords++;
|
||||
TraversalStatistics.nRecords++;
|
||||
|
||||
// actually get the read and hand it to the walker
|
||||
LocusContext locus = iter.next();
|
||||
|
|
@ -127,8 +127,8 @@ public class TraverseByLoci extends TraversalEngine {
|
|||
|
||||
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
||||
|
||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
|
||||
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||
done = true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ public class TraverseByLociByReference extends TraverseByLoci {
|
|||
while ( interval.containsP(refSite.getLocation()) && ! done ) {
|
||||
logger.debug(String.format(" LocusFromReads is %s", locusFromReads == null ? null : locusFromReads.getLocation()));
|
||||
|
||||
this.nRecords++;
|
||||
TraversalStatistics.nRecords++;
|
||||
GenomeLoc current = refSite.getLocation();
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this locus
|
||||
|
|
@ -82,8 +82,8 @@ public class TraverseByLociByReference extends TraverseByLoci {
|
|||
locus.downsampleToCoverage(downsamplingCoverage);
|
||||
sum = walkAtLocus( walker, sum, locus, refSite, tracker );
|
||||
|
||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
|
||||
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||
done = true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -76,7 +76,7 @@ public class TraverseByReads extends TraversalEngine {
|
|||
// copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently
|
||||
LinkedList notYetTraversedLocations = new LinkedList(locations);
|
||||
while (samReadIter.hasNext() && !done) {
|
||||
this.nRecords++;
|
||||
TraversalStatistics.nRecords++;
|
||||
|
||||
// get the next read
|
||||
final SAMRecord read = samReadIter.next();
|
||||
|
|
@ -102,8 +102,8 @@ public class TraverseByReads extends TraversalEngine {
|
|||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
|
||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
||||
logger.warn(String.format(("Maximum number of reads encountered, terminating traversal " + this.nRecords)));
|
||||
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||
logger.warn(String.format(("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)));
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ public class TraverseByReference extends TraverseByLoci {
|
|||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
while ( (interval == null || interval.containsP(refSite.getLocation())) && ! done ) {
|
||||
this.nRecords++;
|
||||
TraversalStatistics.nRecords++;
|
||||
GenomeLoc current = refSite.getLocation();
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this locus
|
||||
|
|
@ -97,8 +97,8 @@ public class TraverseByReference extends TraverseByLoci {
|
|||
locus.setReferenceContig(refSite.getCurrentContig());
|
||||
sum = walkAtLocus( walker, sum, locus, refSite, tracker );
|
||||
|
||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
|
||||
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||
done = true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,93 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* A simple, short-term solution to iterating over all reference positions over a series of
|
||||
* genomic locations. Simply overloads the superclass traverse function to go over the entire
|
||||
* interval's reference positions.
|
||||
* mhanna - Added better data source integration.
|
||||
* TODO: Gain confidence in this implementation and remove the original.
|
||||
*/
|
||||
public class TraverseLociByReference extends TraversalEngine {
|
||||
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
|
||||
|
||||
public TraverseLociByReference(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
super( reads, ref, rods );
|
||||
}
|
||||
|
||||
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
|
||||
if ( locations.isEmpty() )
|
||||
Utils.scareUser("Requested all locations be processed without providing locations to be processed!");
|
||||
|
||||
throw new UnsupportedOperationException("This traversal type not supported by TraverseLociByReference");
|
||||
}
|
||||
|
||||
public <M,T> T traverse( Walker<M,T> walker,
|
||||
Shard shard,
|
||||
ReferenceProvider referenceProvider,
|
||||
LocusContextProvider locusProvider,
|
||||
T sum ) {
|
||||
logger.debug(String.format("TraverseLociByReference.traverse Genomic interval is %s", shard.getGenomeLoc()));
|
||||
|
||||
if ( !(walker instanceof LocusWalker) )
|
||||
throw new IllegalArgumentException("Walker isn't a loci walker!");
|
||||
|
||||
LocusWalker<M, T> locusWalker = (LocusWalker<M, T>)walker;
|
||||
GenomeLoc loc = shard.getGenomeLoc();
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
for( long pos = loc.getStart(); pos <= loc.getStop(); pos++ ) {
|
||||
GenomeLoc site = new GenomeLoc( loc.getContig(), pos );
|
||||
|
||||
TraversalStatistics.nRecords++;
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this locus
|
||||
final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus( site );
|
||||
|
||||
ReferenceIterator refSite = referenceProvider.getReferenceSequence( site );
|
||||
LocusContext locus = locusProvider.getLocusContext( site );
|
||||
locus.setReferenceContig( refSite.getCurrentContig() );
|
||||
|
||||
if ( DOWNSAMPLE_BY_COVERAGE )
|
||||
locus.downsampleToCoverage(downsamplingCoverage);
|
||||
|
||||
final char refBase = refSite.getBaseAsChar();
|
||||
final boolean keepMeP = locusWalker.filter(tracker, refBase, locus);
|
||||
if (keepMeP) {
|
||||
M x = locusWalker.map(tracker, refBase, locus);
|
||||
sum = locusWalker.reduce(x, sum);
|
||||
}
|
||||
|
||||
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||
break;
|
||||
}
|
||||
|
||||
printProgress("loci", locus.getLocation());
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -206,10 +206,13 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
* Useful utility function that parses a location string into a coordinate-order sorted
|
||||
* array of GenomeLoc objects
|
||||
*
|
||||
* @param str
|
||||
* @param str String representation of genome locs. Null string corresponds to no filter.
|
||||
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
|
||||
*/
|
||||
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
|
||||
// Null string means no filter.
|
||||
if( str == null ) return null;
|
||||
|
||||
// Of the form: loc1;loc2;...
|
||||
// Where each locN can be:
|
||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||
|
|
|
|||
Loading…
Reference in New Issue