From 8a1207e4db0d3f3b3a1697a4ffde955458c9304d Mon Sep 17 00:00:00 2001 From: hanna Date: Thu, 9 Apr 2009 20:28:17 +0000 Subject: [PATCH] Bringing up scaffolding for integration of locus traversals by reference with Aaron's data source code. Reverts to original TraverseByLociByReference behavior unless a special combination of command-line flags are used. Lightly tested at best, and major flaws include: - MicroManager is not doing MicroScheduling right now; it's driving the traversals. - New database-ish data providers imply by their interface that they're stateless, but they're highly stateful. - Using static objects to circumvent encapsulation. - Code duplication is rampant. - Plus more! git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@346 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisTK.java | 57 +++++++-- .../providers/LocusContextProvider.java | 112 ++++++++++++++++++ .../providers/ReferenceProvider.java | 27 +++++ .../sting/gatk/executive/MicroManager.java | 80 +++++++++++-- .../gatk/traversals/TraversalEngine.java | 49 +++----- .../gatk/traversals/TraversalStatistics.java | 36 ++++++ .../sting/gatk/traversals/TraverseByLoci.java | 6 +- .../traversals/TraverseByLociByReference.java | 6 +- .../gatk/traversals/TraverseByReads.java | 6 +- .../gatk/traversals/TraverseByReference.java | 6 +- .../traversals/TraverseLociByReference.java | 93 +++++++++++++++ .../broadinstitute/sting/utils/GenomeLoc.java | 5 +- 12 files changed, 417 insertions(+), 66 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextProvider.java create mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java create mode 100755 java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java create mode 100755 java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 084cf2c7a..953876412 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -20,6 +20,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.traversals.*; +import org.broadinstitute.sting.gatk.executive.MicroManager; import org.broadinstitute.sting.utils.FastaSequenceFile2; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; @@ -65,6 +66,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { private TraversalEngine engine = null; public boolean DEBUGGING = false; public Boolean WALK_ALL_LOCI = false; + public Boolean ENABLE_THREADING = false; /** * An output file presented to the walker. @@ -81,6 +83,11 @@ public class GenomeAnalysisTK extends CommandLineProgram { */ public String outErrFileName = null; + /** + * How many threads should be allocated to this analysis. + */ + public int numThreads = 1; + /** * The output stream, initialized from OUTFILENAME / OUTERRFILENAME. * Used by the walker. @@ -127,7 +134,9 @@ public class GenomeAnalysisTK extends CommandLineProgram { m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" ); m_parser.addOptionalArg("err", "e", "An error output file presented to the walker. Will overwrite contents if file exists.", "errFileName" ); m_parser.addOptionalArg("outerr", "oe", "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", "outErrFileName"); - + + m_parser.addOptionalArg("numthreads", "nt", "How many threads should be allocated to running this analysis.", "numThreads"); + m_parser.addOptionalFlag("enablethreading", "et", "Enable experimental threading support.", "ENABLE_THREADING"); //TODO: remove when walkers can ask for tracks m_parser.addOptionalArg("mother", "MOM", "Mother's genotype (SAM pileup)", "MOTHER_GENOTYPE_FILE"); m_parser.addOptionalArg("father", "DAD", "Father's genotype (SAM pileup)", "FATHER_GENOTYPE_FILE"); @@ -138,7 +147,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { .hasArgs() .withDescription( "Bind rod with and to " ) .create("B"); - m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS"); + m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS"); } /** @@ -233,6 +242,15 @@ public class GenomeAnalysisTK extends CommandLineProgram { throw new RuntimeException( "Unable to access walker", ex ); } + // Prepare the sort ordering w.r.t. the sequence dictionary + FastaSequenceFile2 refFile = null; + if (REF_FILE_ARG != null) { + refFile = new FastaSequenceFile2(REF_FILE_ARG); + GenomeLoc.setupRefContigOrdering(refFile); + } + + MicroManager microManager = null; + // Try to get the walker specified try { LocusWalker walker = (LocusWalker) my_walker; @@ -245,9 +263,18 @@ public class GenomeAnalysisTK extends CommandLineProgram { if ( walker.cannotHandleReads() ) Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILE)); - - if ( WALK_ALL_LOCI ) - this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods); + if ( WALK_ALL_LOCI ) { + // TODO: Temporary debugging code. Activate the new debugging code only when the MicroManager + // is not filtered. + if( ENABLE_THREADING && REGION_STR == null ) { + logger.warn("Preliminary threading support enabled"); + microManager = new MicroManager( INPUT_FILE, REF_FILE_ARG, numThreads ); + this.engine = microManager.getTraversalEngine(); + } + else { + this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods); + } + } else this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods); } @@ -263,14 +290,13 @@ public class GenomeAnalysisTK extends CommandLineProgram { final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG); GenomeLoc.setupRefContigOrdering(refFile); } + + // Determine the validation stringency. Default to ValidationStringency.STRICT. ValidationStringency strictness; - if (STRICTNESS_ARG == null) { - strictness = ValidationStringency.STRICT; - } else if (STRICTNESS_ARG.toLowerCase().equals("lenient")) { - strictness = ValidationStringency.LENIENT; - } else if (STRICTNESS_ARG.toLowerCase().equals("silent")) { - strictness = ValidationStringency.SILENT; - } else { + try { + strictness = Enum.valueOf(ValidationStringency.class, STRICTNESS_ARG); + } + catch( IllegalArgumentException ex ) { strictness = ValidationStringency.STRICT; } logger.info("Strictness is " + strictness); @@ -304,7 +330,12 @@ public class GenomeAnalysisTK extends CommandLineProgram { engine.setWalkOverAllSites(WALK_ALL_LOCI); engine.initialize(); - engine.traverse(my_walker); + if( microManager != null ) { + List locations = GenomeLoc.parseGenomeLocs( REGION_STR ); + microManager.execute( my_walker, locations ); + } + else + engine.traverse(my_walker); return 0; } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextProvider.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextProvider.java new file mode 100755 index 000000000..1f2afd78d --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextProvider.java @@ -0,0 +1,112 @@ +package org.broadinstitute.sting.gatk.dataSources.providers; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.gatk.iterators.LocusIterator; +import org.broadinstitute.sting.gatk.iterators.LocusIteratorByHanger; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; +import org.apache.log4j.Logger; +import net.sf.samtools.SAMRecord; + +import java.util.Iterator; +import java.util.ArrayList; + +import edu.mit.broad.picard.filter.FilteringIterator; +import edu.mit.broad.picard.filter.SamRecordFilter; + +/** + * Created by IntelliJ IDEA. + * User: hanna + * Date: Apr 8, 2009 + * Time: 3:00:28 PM + * To change this template use File | Settings | File Templates. + */ +public class LocusContextProvider { + private Iterator reads; + + // What's the last locus accessed? Used for sanity checking. + private GenomeLoc lastLoc = null; + private LocusIterator loci; + + protected static Logger logger = Logger.getLogger(LocusContextProvider.class); + + public LocusContextProvider( Iterator reads ) { + this.reads = new FilteringIterator(reads, new locusStreamFilterFunc()); + // prepare the iterator by loci from reads + loci = new LocusIteratorByHanger(this.reads); + } + + public LocusContext getLocusContext( GenomeLoc loc ) { + // Precondition checks + if( lastLoc != null && !loc.isPast( lastLoc ) ) + throw new RuntimeException( "Internal error: LocusContextProvider assumes that queries it receives are ordered." ); + + if( (loc.getStop() - loc.getStart()) > 0 ) + throw new RuntimeException( "Internal error :LocusContextProviders currently require 1-base genomeLocs."); + + // jump to the first reference site + LocusContext locusContext = advanceReadsToLoc( loci, loc ); + + // if no locus context was found, create an empty locus + if ( locusContext == null || locusContext.getLocation().compareTo( loc ) != 0 ) + locusContext = new LocusContext(loc, new ArrayList(), new ArrayList()); + + lastLoc = loc; + + return locusContext; + } + + private LocusContext advanceReadsToLoc(LocusIterator locusIter, GenomeLoc target) { + if ( ! locusIter.hasNext() ) + return null; + + LocusContext locus = locusIter.next(); + + while ( ! target.containsP(locus.getLocation()) && locusIter.hasNext() ) { + logger.debug(String.format(" current locus is %s vs %s => %d", locus.getLocation(), target, locus.getLocation().compareTo(target))); + locus = locusIter.next(); + } + + logger.debug(String.format(" returning %s", locus.getLocation())); + return locus; + } + + /** + * Class to filter out un-handle-able reads from the stream. We currently are skipping + * unmapped reads, non-primary reads, unaligned reads, and those with indels. We should + * really change this to handle indel containing reads. + * TODO: Update TraversalEngine with our runtime stats. + */ + private class locusStreamFilterFunc implements SamRecordFilter { + SAMRecord lastRead = null; + public boolean filterOut(SAMRecord rec) { + boolean result = false; + String why = ""; + if (rec.getReadUnmappedFlag()) { + TraversalStatistics.nUnmappedReads++; + result = true; + why = "Unmapped"; + } else if (rec.getNotPrimaryAlignmentFlag()) { + TraversalStatistics.nNotPrimary++; + result = true; + why = "Not Primary"; + } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { + TraversalStatistics.nBadAlignments++; + result = true; + why = "No alignment start"; + } + else { + result = false; + } + + if (result) { + //nSkippedReads++; + //System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why); + } else { + //nReads++; + } + return result; + } + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java new file mode 100755 index 000000000..047a9e4b7 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java @@ -0,0 +1,27 @@ +package org.broadinstitute.sting.gatk.dataSources.providers; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; + +/** + * Created by IntelliJ IDEA. + * User: hanna + * Date: Apr 8, 2009 + * Time: 5:01:37 PM + * To change this template use File | Settings | File Templates. + */ +public class ReferenceProvider { + private ReferenceIterator reference; + + public ReferenceProvider( ReferenceIterator reference ) { + this.reference = reference; + } + + public ReferenceIterator getReferenceSequence( GenomeLoc genomeLoc ) { + if( (genomeLoc.getStop() - genomeLoc.getStart()) > 0 ) + throw new RuntimeException( "Internal error :LocusContextProviders currently require 1-base genomeLocs."); + + // jump to the first reference site + return reference.seekForward(genomeLoc); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java index 47b316baa..7c4db183b 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java @@ -1,22 +1,88 @@ package org.broadinstitute.sting.gatk.executive; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.FastaSequenceFile2; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy; +import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory; +import org.broadinstitute.sting.gatk.dataSources.shards.Shard; +import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource; +import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException; +import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider; +import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider; +import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference; +import org.broadinstitute.sting.gatk.traversals.TraversalEngine; + +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; + +import java.util.List; +import java.util.Arrays; +import java.util.Iterator; +import java.io.File; +import java.io.IOException; /** * A micro-scheduling manager for N-way threaded execution of a traversal * */ public class MicroManager { + private static long SHARD_SIZE = 5L; - public MicroManager( TraversalEngineExecutive TEfactory, // makes worker units - GenomeLoc[] locations, // list of work to do - int nThreadsToUse, // maximum number of threads to use to do the work - int initialChunkSize ) { // the initial chunk size for dividing up the work - // do a lot of work here to organize the computation + private File reads; + private FastaSequenceFile2 ref; + + private TraverseLociByReference traversalEngine = null; + + protected static Logger logger = Logger.getLogger(MicroManager.class); + + public TraversalEngine getTraversalEngine() { + return traversalEngine; } - public void execute() { - // actually divide up the work, create worker units, and send them off to do work + public MicroManager( File reads, // the reads file + File refFile, // the reference file driving the traversal + int nThreadsToUse ) { // maximum number of threads to use to do the work + this.reads = reads; + ref = new FastaSequenceFile2(refFile); + GenomeLoc.setupRefContigOrdering(ref); + + traversalEngine = new TraverseLociByReference( reads, refFile, new java.util.ArrayList() ); } + + + public void execute( Walker walker, // the analysis technique to use. + List locations ) { // list of work to do + ShardStrategy shardStrategy = ShardStrategyFactory.shatter( ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, + ref.getSequenceDictionary(), + SHARD_SIZE ); + Object accumulator = ((LocusWalker)walker).reduceInit(); + + for(Shard shard: shardStrategy) { + Iterator readShard = null; + try { + SAMBAMDataSource dataSource = new SAMBAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) ); + readShard = dataSource.seek( shard.getGenomeLoc() ); + } + catch( SimpleDataSourceLoadException ex ) { + throw new RuntimeException( ex ); + } + catch( IOException ex ) { + throw new RuntimeException( ex ); + } + + ReferenceProvider referenceProvider = new ReferenceProvider( new ReferenceIterator(ref) ); + LocusContextProvider locusProvider = new LocusContextProvider( readShard ); + + accumulator = traversalEngine.traverse( walker, shard, referenceProvider, locusProvider, accumulator ); + } + + traversalEngine.printOnTraversalDone("loci", accumulator); + walker.onTraversalDone(accumulator); + } + + } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 7a7beeeba..30c295e0e 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -1,31 +1,21 @@ package org.broadinstitute.sting.gatk.traversals; -import edu.mit.broad.picard.filter.FilteringIterator; import edu.mit.broad.picard.filter.SamRecordFilter; import edu.mit.broad.picard.reference.ReferenceSequence; import edu.mit.broad.picard.sam.SamFileHeaderMerger; import edu.mit.broad.picard.directed.IntervalList; import edu.mit.broad.picard.util.Interval; -import net.sf.functionalj.Function1; -import net.sf.functionalj.FunctionN; -import net.sf.functionalj.Functions; -import net.sf.functionalj.reflect.JdkStdReflect; -import net.sf.functionalj.reflect.StdReflect; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileReader.ValidationStringency; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.RuntimeIOException; -import net.sf.samtools.util.CloseableIterator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.utils.*; import java.io.*; @@ -66,16 +56,6 @@ public abstract class TraversalEngine { protected FastaSequenceFile2 refFile = null; // todo: merge FastaSequenceFile2 into picard! protected ReferenceIterator refIter = null; - // Number of records (loci, reads) we've processed - protected long nRecords = 0; - // How many reads have we processed, along with those skipped for various reasons - protected int nReads = 0; - protected int nSkippedReads = 0; - protected int nUnmappedReads = 0; - protected int nNotPrimary = 0; - protected int nBadAlignments = 0; - protected int nSkippedIndels = 0; - // Progress tracker for the sam file protected FileProgressTracker samReadingTracker = null; @@ -277,7 +257,7 @@ public abstract class TraversalEngine { * @param loc Current location */ public void printProgress(boolean mustPrint, final String type, GenomeLoc loc) { - final long nRecords = this.nRecords; + final long nRecords = TraversalStatistics.nRecords; final long curTime = System.currentTimeMillis(); final double elapsed = (curTime - startTime) / 1000.0; //System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime)); @@ -307,17 +287,20 @@ public abstract class TraversalEngine { * @param sum The reduce result of the traversal * @param ReduceType of the traversal */ - protected void printOnTraversalDone(final String type, T sum) { + public void printOnTraversalDone(final String type, T sum) { printProgress(true, type, null); logger.info("Traversal reduce result is " + sum); final long curTime = System.currentTimeMillis(); final double elapsed = (curTime - startTime) / 1000.0; logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600)); - logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)", nSkippedReads, nReads, (nSkippedReads * 100.0) / nReads)); - logger.info(String.format(" -> %d unmapped reads", nUnmappedReads)); - logger.info(String.format(" -> %d non-primary reads", nNotPrimary)); - logger.info(String.format(" -> %d reads with bad alignments", nBadAlignments)); - logger.info(String.format(" -> %d reads with indels", nSkippedIndels)); + logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)", + TraversalStatistics.nSkippedReads, + TraversalStatistics.nReads, + (TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads)); + logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads)); + logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary)); + logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments)); + logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels)); } // -------------------------------------------------------------------------------------------------------------- @@ -520,27 +503,27 @@ public abstract class TraversalEngine { boolean result = false; String why = ""; if (rec.getReadUnmappedFlag()) { - nUnmappedReads++; + TraversalStatistics.nUnmappedReads++; result = true; why = "Unmapped"; } else if (rec.getNotPrimaryAlignmentFlag()) { - nNotPrimary++; + TraversalStatistics.nNotPrimary++; result = true; why = "Not Primary"; } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { - nBadAlignments++; + TraversalStatistics.nBadAlignments++; result = true; why = "No alignment start"; - } + } else { result = false; } if (result) { - nSkippedReads++; + TraversalStatistics.nSkippedReads++; //System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why); } else { - nReads++; + TraversalStatistics.nReads++; } return result; } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java new file mode 100755 index 000000000..8c45ada90 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalStatistics.java @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.gatk.traversals; + +/** + * Created by IntelliJ IDEA. + * User: hanna + * Date: Apr 8, 2009 + * Time: 4:13:40 PM + * + * Holds a bunch of basic information about the traversal. + * TODO: Make this a class that can be passed around from the TraversalEngine to other entries that want to update it. + */ +public class TraversalStatistics { + // Number of records (loci, reads) we've processed + public static long nRecords; + // How many reads have we processed, along with those skipped for various reasons + public static int nReads; + public static int nSkippedReads; + public static int nUnmappedReads; + public static int nNotPrimary; + public static int nBadAlignments; + public static int nSkippedIndels; + + static { + reset(); + } + + public static void reset() { + nRecords = 0; + nReads = 0; + nSkippedReads = 0; + nUnmappedReads = 0; + nNotPrimary = 0; + nBadAlignments = 0; + nSkippedIndels = 0; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java index bbf4007e5..92dbf381c 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java @@ -107,7 +107,7 @@ public class TraverseByLoci extends TraversalEngine { boolean done = false; LocusIterator iter = new LocusIteratorByHanger(filterIter); while (iter.hasNext() && !done) { - this.nRecords++; + TraversalStatistics.nRecords++; // actually get the read and hand it to the walker LocusContext locus = iter.next(); @@ -127,8 +127,8 @@ public class TraverseByLoci extends TraversalEngine { //System.out.format("Working at %s\n", locus.getLocation().toString()); - if (this.maxReads > 0 && this.nRecords > this.maxReads) { - logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords)); + if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); done = true; } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java index d482a9861..de9266e32 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java @@ -61,7 +61,7 @@ public class TraverseByLociByReference extends TraverseByLoci { while ( interval.containsP(refSite.getLocation()) && ! done ) { logger.debug(String.format(" LocusFromReads is %s", locusFromReads == null ? null : locusFromReads.getLocation())); - this.nRecords++; + TraversalStatistics.nRecords++; GenomeLoc current = refSite.getLocation(); // Iterate forward to get all reference ordered data covering this locus @@ -82,8 +82,8 @@ public class TraverseByLociByReference extends TraverseByLoci { locus.downsampleToCoverage(downsamplingCoverage); sum = walkAtLocus( walker, sum, locus, refSite, tracker ); - if (this.maxReads > 0 && this.nRecords > this.maxReads) { - logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords)); + if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); done = true; } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java index 36a14a355..dc6219ff4 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java @@ -76,7 +76,7 @@ public class TraverseByReads extends TraversalEngine { // copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently LinkedList notYetTraversedLocations = new LinkedList(locations); while (samReadIter.hasNext() && !done) { - this.nRecords++; + TraversalStatistics.nRecords++; // get the next read final SAMRecord read = samReadIter.next(); @@ -102,8 +102,8 @@ public class TraverseByReads extends TraversalEngine { sum = walker.reduce(x, sum); } - if (this.maxReads > 0 && this.nRecords > this.maxReads) { - logger.warn(String.format(("Maximum number of reads encountered, terminating traversal " + this.nRecords))); + if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) { + logger.warn(String.format(("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords))); done = true; } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java index 99444b7fa..69557a2a2 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java @@ -87,7 +87,7 @@ public class TraverseByReference extends TraverseByLoci { // We keep processing while the next reference location is within the interval while ( (interval == null || interval.containsP(refSite.getLocation())) && ! done ) { - this.nRecords++; + TraversalStatistics.nRecords++; GenomeLoc current = refSite.getLocation(); // Iterate forward to get all reference ordered data covering this locus @@ -97,8 +97,8 @@ public class TraverseByReference extends TraverseByLoci { locus.setReferenceContig(refSite.getCurrentContig()); sum = walkAtLocus( walker, sum, locus, refSite, tracker ); - if (this.maxReads > 0 && this.nRecords > this.maxReads) { - logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords)); + if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); done = true; } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java new file mode 100755 index 000000000..d08ae0da1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java @@ -0,0 +1,93 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.dataSources.shards.Shard; +import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider; +import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.apache.log4j.Logger; + +import java.util.List; +import java.util.ArrayList; +import java.io.File; + +/** + * A simple, short-term solution to iterating over all reference positions over a series of + * genomic locations. Simply overloads the superclass traverse function to go over the entire + * interval's reference positions. + * mhanna - Added better data source integration. + * TODO: Gain confidence in this implementation and remove the original. + */ +public class TraverseLociByReference extends TraversalEngine { + + /** + * our log, which we want to capture anything from this class + */ + protected static Logger logger = Logger.getLogger(TraversalEngine.class); + + + public TraverseLociByReference(File reads, File ref, List> rods) { + super( reads, ref, rods ); + } + + public T traverse(Walker walker, ArrayList locations) { + if ( locations.isEmpty() ) + Utils.scareUser("Requested all locations be processed without providing locations to be processed!"); + + throw new UnsupportedOperationException("This traversal type not supported by TraverseLociByReference"); + } + + public T traverse( Walker walker, + Shard shard, + ReferenceProvider referenceProvider, + LocusContextProvider locusProvider, + T sum ) { + logger.debug(String.format("TraverseLociByReference.traverse Genomic interval is %s", shard.getGenomeLoc())); + + if ( !(walker instanceof LocusWalker) ) + throw new IllegalArgumentException("Walker isn't a loci walker!"); + + LocusWalker locusWalker = (LocusWalker)walker; + GenomeLoc loc = shard.getGenomeLoc(); + + // We keep processing while the next reference location is within the interval + for( long pos = loc.getStart(); pos <= loc.getStop(); pos++ ) { + GenomeLoc site = new GenomeLoc( loc.getContig(), pos ); + + TraversalStatistics.nRecords++; + + // Iterate forward to get all reference ordered data covering this locus + final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus( site ); + + ReferenceIterator refSite = referenceProvider.getReferenceSequence( site ); + LocusContext locus = locusProvider.getLocusContext( site ); + locus.setReferenceContig( refSite.getCurrentContig() ); + + if ( DOWNSAMPLE_BY_COVERAGE ) + locus.downsampleToCoverage(downsamplingCoverage); + + final char refBase = refSite.getBaseAsChar(); + final boolean keepMeP = locusWalker.filter(tracker, refBase, locus); + if (keepMeP) { + M x = locusWalker.map(tracker, refBase, locus); + sum = locusWalker.reduce(x, sum); + } + + if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); + break; + } + + printProgress("loci", locus.getLocation()); + } + + return sum; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index fc47c01a3..48b9a9206 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -206,10 +206,13 @@ public class GenomeLoc implements Comparable { * Useful utility function that parses a location string into a coordinate-order sorted * array of GenomeLoc objects * - * @param str + * @param str String representation of genome locs. Null string corresponds to no filter. * @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order */ public static ArrayList parseGenomeLocs(final String str) { + // Null string means no filter. + if( str == null ) return null; + // Of the form: loc1;loc2;... // Where each locN can be: // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'