Bringing up scaffolding for integration of locus traversals by reference with Aaron's data source code.

Reverts to original TraverseByLociByReference behavior unless a special combination of command-line flags is used.

Lightly tested at best, and major flaws include:
- MicroManager is not doing MicroScheduling right now; it's driving the traversals.
- New database-ish data providers imply by their interface that they're stateless, but they're highly stateful.
- Using static objects to circumvent encapsulation.
- Code duplication is rampant.
- Plus more!


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@346 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-04-09 20:28:17 +00:00
parent 49b2622e3d
commit 8a1207e4db
12 changed files with 417 additions and 66 deletions

View File

@ -20,6 +20,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.executive.MicroManager;
import org.broadinstitute.sting.utils.FastaSequenceFile2;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
@ -65,6 +66,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
private TraversalEngine engine = null;
public boolean DEBUGGING = false;
public Boolean WALK_ALL_LOCI = false;
public Boolean ENABLE_THREADING = false;
/**
* An output file presented to the walker.
@ -81,6 +83,11 @@ public class GenomeAnalysisTK extends CommandLineProgram {
*/
public String outErrFileName = null;
/**
* How many threads should be allocated to this analysis.
*/
public int numThreads = 1;
/**
* The output stream, initialized from OUTFILENAME / OUTERRFILENAME.
* Used by the walker.
@ -127,7 +134,9 @@ public class GenomeAnalysisTK extends CommandLineProgram {
m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" );
m_parser.addOptionalArg("err", "e", "An error output file presented to the walker. Will overwrite contents if file exists.", "errFileName" );
m_parser.addOptionalArg("outerr", "oe", "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", "outErrFileName");
m_parser.addOptionalArg("numthreads", "nt", "How many threads should be allocated to running this analysis.", "numThreads");
m_parser.addOptionalFlag("enablethreading", "et", "Enable experimental threading support.", "ENABLE_THREADING");
//TODO: remove when walkers can ask for tracks
m_parser.addOptionalArg("mother", "MOM", "Mother's genotype (SAM pileup)", "MOTHER_GENOTYPE_FILE");
m_parser.addOptionalArg("father", "DAD", "Father's genotype (SAM pileup)", "FATHER_GENOTYPE_FILE");
@ -138,7 +147,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
.hasArgs()
.withDescription( "Bind rod with <name> and <type> to <file>" )
.create("B");
m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS");
m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS");
}
/**
@ -233,6 +242,15 @@ public class GenomeAnalysisTK extends CommandLineProgram {
throw new RuntimeException( "Unable to access walker", ex );
}
// Prepare the sort ordering w.r.t. the sequence dictionary
FastaSequenceFile2 refFile = null;
if (REF_FILE_ARG != null) {
refFile = new FastaSequenceFile2(REF_FILE_ARG);
GenomeLoc.setupRefContigOrdering(refFile);
}
MicroManager microManager = null;
// Try to get the walker specified
try {
LocusWalker<?, ?> walker = (LocusWalker<?, ?>) my_walker;
@ -245,9 +263,18 @@ public class GenomeAnalysisTK extends CommandLineProgram {
if ( walker.cannotHandleReads() )
Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILE));
if ( WALK_ALL_LOCI )
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
if ( WALK_ALL_LOCI ) {
// TODO: Temporary debugging code. Activate the new debugging code only when the MicroManager
// is not filtered.
if( ENABLE_THREADING && REGION_STR == null ) {
logger.warn("Preliminary threading support enabled");
microManager = new MicroManager( INPUT_FILE, REF_FILE_ARG, numThreads );
this.engine = microManager.getTraversalEngine();
}
else {
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
}
}
else
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
}
@ -263,14 +290,13 @@ public class GenomeAnalysisTK extends CommandLineProgram {
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
GenomeLoc.setupRefContigOrdering(refFile);
}
// Determine the validation stringency. Default to ValidationStringency.STRICT.
ValidationStringency strictness;
if (STRICTNESS_ARG == null) {
strictness = ValidationStringency.STRICT;
} else if (STRICTNESS_ARG.toLowerCase().equals("lenient")) {
strictness = ValidationStringency.LENIENT;
} else if (STRICTNESS_ARG.toLowerCase().equals("silent")) {
strictness = ValidationStringency.SILENT;
} else {
try {
strictness = Enum.valueOf(ValidationStringency.class, STRICTNESS_ARG);
}
catch( IllegalArgumentException ex ) {
strictness = ValidationStringency.STRICT;
}
logger.info("Strictness is " + strictness);
@ -304,7 +330,12 @@ public class GenomeAnalysisTK extends CommandLineProgram {
engine.setWalkOverAllSites(WALK_ALL_LOCI);
engine.initialize();
engine.traverse(my_walker);
if( microManager != null ) {
List<GenomeLoc> locations = GenomeLoc.parseGenomeLocs( REGION_STR );
microManager.execute( my_walker, locations );
}
else
engine.traverse(my_walker);
return 0;
}

View File

@ -0,0 +1,112 @@
package org.broadinstitute.sting.gatk.dataSources.providers;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByHanger;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
import org.apache.log4j.Logger;
import net.sf.samtools.SAMRecord;
import java.util.Iterator;
import java.util.ArrayList;
import edu.mit.broad.picard.filter.FilteringIterator;
import edu.mit.broad.picard.filter.SamRecordFilter;
/**
* Created by IntelliJ IDEA.
* User: hanna
* Date: Apr 8, 2009
* Time: 3:00:28 PM
* To change this template use File | Settings | File Templates.
*/
public class LocusContextProvider {
    /** Read stream backing this provider, pre-filtered to drop un-handle-able reads. */
    private Iterator<SAMRecord> reads;

    // What's the last locus accessed? Used for sanity checking.
    private GenomeLoc lastLoc = null;

    /** Locus-by-locus view over the filtered read stream. */
    private LocusIterator loci;

    protected static Logger logger = Logger.getLogger(LocusContextProvider.class);

    /**
     * Create a provider over the given read stream.  Despite the provider-style
     * interface this object is highly stateful: the read stream is consumed in
     * order, so queries must arrive in increasing genomic order.
     *
     * @param reads Iterator over reads, assumed sorted in coordinate order.
     */
    public LocusContextProvider( Iterator<SAMRecord> reads ) {
        this.reads = new FilteringIterator(reads, new locusStreamFilterFunc());
        // prepare the iterator by loci from reads
        loci = new LocusIteratorByHanger(this.reads);
    }

    /**
     * Get the context (reads and offsets) covering the given single-base location.
     *
     * @param loc Single-base location to query; must be past all previous queries.
     * @return Context at the given location; an empty context (no reads) if no
     *         pileup covers the location.  Never null.
     */
    public LocusContext getLocusContext( GenomeLoc loc ) {
        // Precondition checks
        if( lastLoc != null && !loc.isPast( lastLoc ) )
            throw new RuntimeException( "Internal error: LocusContextProvider assumes that queries it receives are ordered." );
        if( (loc.getStop() - loc.getStart()) > 0 )
            throw new RuntimeException( "Internal error: LocusContextProviders currently require 1-base genomeLocs.");

        // jump to the first reference site
        LocusContext locusContext = advanceReadsToLoc( loci, loc );

        // if no locus context was found, create an empty locus
        if ( locusContext == null || locusContext.getLocation().compareTo( loc ) != 0 )
            locusContext = new LocusContext(loc, new ArrayList<SAMRecord>(), new ArrayList<Integer>());

        lastLoc = loc;
        return locusContext;
    }

    /**
     * Advance the locus iterator until it reaches (or passes) the target location.
     *
     * @param locusIter Iterator to advance.
     * @param target Location to seek.
     * @return Context at or just past the target; null if the iterator is exhausted
     *         before any locus is produced.
     */
    private LocusContext advanceReadsToLoc(LocusIterator locusIter, GenomeLoc target) {
        if ( ! locusIter.hasNext() )
            return null;

        LocusContext locus = locusIter.next();

        while ( ! target.containsP(locus.getLocation()) && locusIter.hasNext() ) {
            logger.debug(String.format(" current locus is %s vs %s => %d", locus.getLocation(), target, locus.getLocation().compareTo(target)));
            locus = locusIter.next();
        }

        logger.debug(String.format(" returning %s", locus.getLocation()));
        return locus;
    }

    /**
     * Class to filter out un-handle-able reads from the stream. We currently are skipping
     * unmapped reads, non-primary reads, unaligned reads, and those with indels. We should
     * really change this to handle indel containing reads.
     * TODO: This duplicates the filter in TraversalEngine; unify the two.
     */
    private class locusStreamFilterFunc implements SamRecordFilter {
        SAMRecord lastRead = null;
        public boolean filterOut(SAMRecord rec) {
            boolean result = false;
            String why = "";
            if (rec.getReadUnmappedFlag()) {
                TraversalStatistics.nUnmappedReads++;
                result = true;
                why = "Unmapped";
            } else if (rec.getNotPrimaryAlignmentFlag()) {
                TraversalStatistics.nNotPrimary++;
                result = true;
                why = "Not Primary";
            } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
                TraversalStatistics.nBadAlignments++;
                result = true;
                why = "No alignment start";
            }

            // Keep the shared skipped/kept totals consistent with the equivalent
            // filter in TraversalEngine, so stats reported at traversal end are
            // correct when this provider drives the traversal.
            if (result) {
                TraversalStatistics.nSkippedReads++;
                //System.out.printf("  [filter] %s => %b %s", rec.getReadName(), result, why);
            } else {
                TraversalStatistics.nReads++;
            }
            return result;
        }
    }
}

View File

@ -0,0 +1,27 @@
package org.broadinstitute.sting.gatk.dataSources.providers;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
/**
* Created by IntelliJ IDEA.
* User: hanna
* Date: Apr 8, 2009
* Time: 5:01:37 PM
* To change this template use File | Settings | File Templates.
*/
public class ReferenceProvider {
    /** Forward-only iterator over the reference sequence. */
    private ReferenceIterator reference;

    /**
     * Create a provider over the given reference iterator.  The iterator only
     * seeks forward, so queries must arrive in increasing genomic order.
     *
     * @param reference Iterator over the reference sequence.
     */
    public ReferenceProvider( ReferenceIterator reference ) {
        this.reference = reference;
    }

    /**
     * Position the reference iterator at the given single-base location.
     *
     * @param genomeLoc Single-base location to query.
     * @return The reference iterator advanced to the requested location.
     */
    public ReferenceIterator getReferenceSequence( GenomeLoc genomeLoc ) {
        // Fixed copy-paste in the message: this is ReferenceProvider, not LocusContextProvider.
        if( (genomeLoc.getStop() - genomeLoc.getStart()) > 0 )
            throw new RuntimeException( "Internal error: ReferenceProviders currently require 1-base genomeLocs.");

        // jump to the first reference site
        return reference.seekForward(genomeLoc);
    }
}

View File

@ -1,22 +1,88 @@
package org.broadinstitute.sting.gatk.executive;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.FastaSequenceFile2;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import java.util.List;
import java.util.Arrays;
import java.util.Iterator;
import java.io.File;
import java.io.IOException;
/**
* A micro-scheduling manager for N-way threaded execution of a traversal
*
*/
public class MicroManager {
private static long SHARD_SIZE = 5L;
public MicroManager( TraversalEngineExecutive TEfactory, // makes worker units
GenomeLoc[] locations, // list of work to do
int nThreadsToUse, // maximum number of threads to use to do the work
int initialChunkSize ) { // the initial chunk size for dividing up the work
// do a lot of work here to organize the computation
private File reads;
private FastaSequenceFile2 ref;
private TraverseLociByReference traversalEngine = null;
protected static Logger logger = Logger.getLogger(MicroManager.class);
public TraversalEngine getTraversalEngine() {
return traversalEngine;
}
public void execute() {
// actually divide up the work, create worker units, and send them off to do work
public MicroManager( File reads, // the reads file
File refFile, // the reference file driving the traversal
int nThreadsToUse ) { // maximum number of threads to use to do the work
this.reads = reads;
ref = new FastaSequenceFile2(refFile);
GenomeLoc.setupRefContigOrdering(ref);
traversalEngine = new TraverseLociByReference( reads, refFile, new java.util.ArrayList() );
}
public void execute( Walker walker, // the analysis technique to use.
List<GenomeLoc> locations ) { // list of work to do
ShardStrategy shardStrategy = ShardStrategyFactory.shatter( ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
ref.getSequenceDictionary(),
SHARD_SIZE );
Object accumulator = ((LocusWalker<?,?>)walker).reduceInit();
for(Shard shard: shardStrategy) {
Iterator<SAMRecord> readShard = null;
try {
SAMBAMDataSource dataSource = new SAMBAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) );
readShard = dataSource.seek( shard.getGenomeLoc() );
}
catch( SimpleDataSourceLoadException ex ) {
throw new RuntimeException( ex );
}
catch( IOException ex ) {
throw new RuntimeException( ex );
}
ReferenceProvider referenceProvider = new ReferenceProvider( new ReferenceIterator(ref) );
LocusContextProvider locusProvider = new LocusContextProvider( readShard );
accumulator = traversalEngine.traverse( walker, shard, referenceProvider, locusProvider, accumulator );
}
traversalEngine.printOnTraversalDone("loci", accumulator);
walker.onTraversalDone(accumulator);
}
}

View File

@ -1,31 +1,21 @@
package org.broadinstitute.sting.gatk.traversals;
import edu.mit.broad.picard.filter.FilteringIterator;
import edu.mit.broad.picard.filter.SamRecordFilter;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import edu.mit.broad.picard.directed.IntervalList;
import edu.mit.broad.picard.util.Interval;
import net.sf.functionalj.Function1;
import net.sf.functionalj.FunctionN;
import net.sf.functionalj.Functions;
import net.sf.functionalj.reflect.JdkStdReflect;
import net.sf.functionalj.reflect.StdReflect;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.RuntimeIOException;
import net.sf.samtools.util.CloseableIterator;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.*;
import java.io.*;
@ -66,16 +56,6 @@ public abstract class TraversalEngine {
protected FastaSequenceFile2 refFile = null; // todo: merge FastaSequenceFile2 into picard!
protected ReferenceIterator refIter = null;
// Number of records (loci, reads) we've processed
protected long nRecords = 0;
// How many reads have we processed, along with those skipped for various reasons
protected int nReads = 0;
protected int nSkippedReads = 0;
protected int nUnmappedReads = 0;
protected int nNotPrimary = 0;
protected int nBadAlignments = 0;
protected int nSkippedIndels = 0;
// Progress tracker for the sam file
protected FileProgressTracker samReadingTracker = null;
@ -277,7 +257,7 @@ public abstract class TraversalEngine {
* @param loc Current location
*/
public void printProgress(boolean mustPrint, final String type, GenomeLoc loc) {
final long nRecords = this.nRecords;
final long nRecords = TraversalStatistics.nRecords;
final long curTime = System.currentTimeMillis();
final double elapsed = (curTime - startTime) / 1000.0;
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
@ -307,17 +287,20 @@ public abstract class TraversalEngine {
* @param sum The reduce result of the traversal
* @param <T> ReduceType of the traversal
*/
protected <T> void printOnTraversalDone(final String type, T sum) {
public <T> void printOnTraversalDone(final String type, T sum) {
printProgress(true, type, null);
logger.info("Traversal reduce result is " + sum);
final long curTime = System.currentTimeMillis();
final double elapsed = (curTime - startTime) / 1000.0;
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)", nSkippedReads, nReads, (nSkippedReads * 100.0) / nReads));
logger.info(String.format(" -> %d unmapped reads", nUnmappedReads));
logger.info(String.format(" -> %d non-primary reads", nNotPrimary));
logger.info(String.format(" -> %d reads with bad alignments", nBadAlignments));
logger.info(String.format(" -> %d reads with indels", nSkippedIndels));
logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)",
TraversalStatistics.nSkippedReads,
TraversalStatistics.nReads,
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary));
logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments));
logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels));
}
// --------------------------------------------------------------------------------------------------------------
@ -520,27 +503,27 @@ public abstract class TraversalEngine {
boolean result = false;
String why = "";
if (rec.getReadUnmappedFlag()) {
nUnmappedReads++;
TraversalStatistics.nUnmappedReads++;
result = true;
why = "Unmapped";
} else if (rec.getNotPrimaryAlignmentFlag()) {
nNotPrimary++;
TraversalStatistics.nNotPrimary++;
result = true;
why = "Not Primary";
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
nBadAlignments++;
TraversalStatistics.nBadAlignments++;
result = true;
why = "No alignment start";
}
}
else {
result = false;
}
if (result) {
nSkippedReads++;
TraversalStatistics.nSkippedReads++;
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
} else {
nReads++;
TraversalStatistics.nReads++;
}
return result;
}

View File

@ -0,0 +1,36 @@
package org.broadinstitute.sting.gatk.traversals;
/**
* Created by IntelliJ IDEA.
* User: hanna
* Date: Apr 8, 2009
* Time: 4:13:40 PM
*
* Holds a bunch of basic information about the traversal.
* TODO: Make this a class that can be passed around from the TraversalEngine to other entries that want to update it.
*/
public class TraversalStatistics {
    /** Number of records (loci, reads) processed so far. */
    public static long nRecords = 0;

    /** Reads kept and handed to the traversal. */
    public static int nReads = 0;
    /** Total reads skipped, for any of the reasons counted below. */
    public static int nSkippedReads = 0;
    /** Reads skipped because they were unmapped. */
    public static int nUnmappedReads = 0;
    /** Reads skipped because they were not the primary alignment. */
    public static int nNotPrimary = 0;
    /** Reads skipped because they had no alignment start. */
    public static int nBadAlignments = 0;
    /** Reads skipped because they contained indels. */
    public static int nSkippedIndels = 0;

    /** Zero every counter, e.g. before starting a fresh traversal. */
    public static void reset() {
        nRecords = 0L;
        nReads = 0;
        nSkippedReads = 0;
        nUnmappedReads = 0;
        nNotPrimary = 0;
        nBadAlignments = 0;
        nSkippedIndels = 0;
    }
}

View File

@ -107,7 +107,7 @@ public class TraverseByLoci extends TraversalEngine {
boolean done = false;
LocusIterator iter = new LocusIteratorByHanger(filterIter);
while (iter.hasNext() && !done) {
this.nRecords++;
TraversalStatistics.nRecords++;
// actually get the read and hand it to the walker
LocusContext locus = iter.next();
@ -127,8 +127,8 @@ public class TraverseByLoci extends TraversalEngine {
//System.out.format("Working at %s\n", locus.getLocation().toString());
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
done = true;
}

View File

@ -61,7 +61,7 @@ public class TraverseByLociByReference extends TraverseByLoci {
while ( interval.containsP(refSite.getLocation()) && ! done ) {
logger.debug(String.format(" LocusFromReads is %s", locusFromReads == null ? null : locusFromReads.getLocation()));
this.nRecords++;
TraversalStatistics.nRecords++;
GenomeLoc current = refSite.getLocation();
// Iterate forward to get all reference ordered data covering this locus
@ -82,8 +82,8 @@ public class TraverseByLociByReference extends TraverseByLoci {
locus.downsampleToCoverage(downsamplingCoverage);
sum = walkAtLocus( walker, sum, locus, refSite, tracker );
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
done = true;
}

View File

@ -76,7 +76,7 @@ public class TraverseByReads extends TraversalEngine {
// copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently
LinkedList notYetTraversedLocations = new LinkedList(locations);
while (samReadIter.hasNext() && !done) {
this.nRecords++;
TraversalStatistics.nRecords++;
// get the next read
final SAMRecord read = samReadIter.next();
@ -102,8 +102,8 @@ public class TraverseByReads extends TraversalEngine {
sum = walker.reduce(x, sum);
}
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
logger.warn(String.format(("Maximum number of reads encountered, terminating traversal " + this.nRecords)));
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
logger.warn(String.format(("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)));
done = true;
}
}

View File

@ -87,7 +87,7 @@ public class TraverseByReference extends TraverseByLoci {
// We keep processing while the next reference location is within the interval
while ( (interval == null || interval.containsP(refSite.getLocation())) && ! done ) {
this.nRecords++;
TraversalStatistics.nRecords++;
GenomeLoc current = refSite.getLocation();
// Iterate forward to get all reference ordered data covering this locus
@ -97,8 +97,8 @@ public class TraverseByReference extends TraverseByLoci {
locus.setReferenceContig(refSite.getCurrentContig());
sum = walkAtLocus( walker, sum, locus, refSite, tracker );
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
done = true;
}

View File

@ -0,0 +1,93 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.apache.log4j.Logger;
import java.util.List;
import java.util.ArrayList;
import java.io.File;
/**
* A simple, short-term solution to iterating over all reference positions over a series of
* genomic locations. Simply overloads the superclass traverse function to go over the entire
* interval's reference positions.
* mhanna - Added better data source integration.
* TODO: Gain confidence in this implementation and remove the original.
*/
public class TraverseLociByReference extends TraversalEngine {
    /**
     * our log, which we want to capture anything from this class
     * (registered under this class, not TraversalEngine, so log output is attributable)
     */
    protected static Logger logger = Logger.getLogger(TraverseLociByReference.class);

    /**
     * @param reads SAM/BAM file supplying the reads.
     * @param ref Reference file driving the traversal.
     * @param rods Reference-ordered data to present at each locus.
     */
    public TraverseLociByReference(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
        super( reads, ref, rods );
    }

    /**
     * Location-list traversal is not supported by this traversal type; callers
     * must use the shard-based overload below.
     *
     * @throws UnsupportedOperationException always (after warning on an empty location list).
     */
    public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
        if ( locations.isEmpty() )
            Utils.scareUser("Requested all locations be processed without providing locations to be processed!");
        throw new UnsupportedOperationException("This traversal type not supported by TraverseLociByReference");
    }

    /**
     * Walk every reference position within the shard's interval, presenting the
     * reference base, reference-ordered data and locus context to the walker at
     * each site, folding the walker's map results into the running reduce.
     *
     * @param walker Analysis to run; must be a LocusWalker.
     * @param shard Interval of the genome to traverse.
     * @param referenceProvider Source of reference bases, queried in order.
     * @param locusProvider Source of read pileups, queried in order.
     * @param sum Reduce value accumulated so far.
     * @return The reduce value after processing every position in the shard.
     * @throws IllegalArgumentException if the walker is not a LocusWalker.
     */
    public <M,T> T traverse( Walker<M,T> walker,
                             Shard shard,
                             ReferenceProvider referenceProvider,
                             LocusContextProvider locusProvider,
                             T sum ) {
        logger.debug(String.format("TraverseLociByReference.traverse Genomic interval is %s", shard.getGenomeLoc()));

        if ( !(walker instanceof LocusWalker) )
            throw new IllegalArgumentException("Walker isn't a loci walker!");
        LocusWalker<M, T> locusWalker = (LocusWalker<M, T>)walker;

        GenomeLoc loc = shard.getGenomeLoc();

        // We keep processing while the next reference location is within the interval
        for( long pos = loc.getStart(); pos <= loc.getStop(); pos++ ) {
            GenomeLoc site = new GenomeLoc( loc.getContig(), pos );
            TraversalStatistics.nRecords++;

            // Iterate forward to get all reference ordered data covering this locus
            final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus( site );
            ReferenceIterator refSite = referenceProvider.getReferenceSequence( site );
            LocusContext locus = locusProvider.getLocusContext( site );
            locus.setReferenceContig( refSite.getCurrentContig() );

            if ( DOWNSAMPLE_BY_COVERAGE )
                locus.downsampleToCoverage(downsamplingCoverage);

            final char refBase = refSite.getBaseAsChar();

            // Standard filter/map/reduce at this site.
            final boolean keepMeP = locusWalker.filter(tracker, refBase, locus);
            if (keepMeP) {
                M x = locusWalker.map(tracker, refBase, locus);
                sum = locusWalker.reduce(x, sum);
            }

            if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
                // Plain concatenation: the old String.format call had no format
                // specifiers and would throw if the text ever contained '%'.
                logger.warn("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords);
                break;
            }

            printProgress("loci", locus.getLocation());
        }
        return sum;
    }
}

View File

@ -206,10 +206,13 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
* Useful utility function that parses a location string into a coordinate-order sorted
* array of GenomeLoc objects
*
* @param str
* @param str String representation of genome locs. Null string corresponds to no filter.
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
*/
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
// Null string means no filter.
if( str == null ) return null;
// Of the form: loc1;loc2;...
// Where each locN can be:
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'