Bringing up scaffolding for integration of locus traversals by reference with Aaron's data source code.
Reverts to original TraverseByLociByReference behavior unless a special combination of command-line flags are used. Lightly tested at best, and major flaws include: - MicroManager is not doing MicroScheduling right now; it's driving the traversals. - New database-ish data providers imply by their interface that they're stateless, but they're highly stateful. - Using static objects to circumvent encapsulation. - Code duplication is rampant. - Plus more! git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@346 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
49b2622e3d
commit
8a1207e4db
|
|
@ -20,6 +20,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.gatk.traversals.*;
|
import org.broadinstitute.sting.gatk.traversals.*;
|
||||||
|
import org.broadinstitute.sting.gatk.executive.MicroManager;
|
||||||
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
@ -65,6 +66,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
private TraversalEngine engine = null;
|
private TraversalEngine engine = null;
|
||||||
public boolean DEBUGGING = false;
|
public boolean DEBUGGING = false;
|
||||||
public Boolean WALK_ALL_LOCI = false;
|
public Boolean WALK_ALL_LOCI = false;
|
||||||
|
public Boolean ENABLE_THREADING = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An output file presented to the walker.
|
* An output file presented to the walker.
|
||||||
|
|
@ -81,6 +83,11 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
*/
|
*/
|
||||||
public String outErrFileName = null;
|
public String outErrFileName = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many threads should be allocated to this analysis.
|
||||||
|
*/
|
||||||
|
public int numThreads = 1;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The output stream, initialized from OUTFILENAME / OUTERRFILENAME.
|
* The output stream, initialized from OUTFILENAME / OUTERRFILENAME.
|
||||||
* Used by the walker.
|
* Used by the walker.
|
||||||
|
|
@ -128,6 +135,8 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
m_parser.addOptionalArg("err", "e", "An error output file presented to the walker. Will overwrite contents if file exists.", "errFileName" );
|
m_parser.addOptionalArg("err", "e", "An error output file presented to the walker. Will overwrite contents if file exists.", "errFileName" );
|
||||||
m_parser.addOptionalArg("outerr", "oe", "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", "outErrFileName");
|
m_parser.addOptionalArg("outerr", "oe", "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", "outErrFileName");
|
||||||
|
|
||||||
|
m_parser.addOptionalArg("numthreads", "nt", "How many threads should be allocated to running this analysis.", "numThreads");
|
||||||
|
m_parser.addOptionalFlag("enablethreading", "et", "Enable experimental threading support.", "ENABLE_THREADING");
|
||||||
//TODO: remove when walkers can ask for tracks
|
//TODO: remove when walkers can ask for tracks
|
||||||
m_parser.addOptionalArg("mother", "MOM", "Mother's genotype (SAM pileup)", "MOTHER_GENOTYPE_FILE");
|
m_parser.addOptionalArg("mother", "MOM", "Mother's genotype (SAM pileup)", "MOTHER_GENOTYPE_FILE");
|
||||||
m_parser.addOptionalArg("father", "DAD", "Father's genotype (SAM pileup)", "FATHER_GENOTYPE_FILE");
|
m_parser.addOptionalArg("father", "DAD", "Father's genotype (SAM pileup)", "FATHER_GENOTYPE_FILE");
|
||||||
|
|
@ -233,6 +242,15 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
throw new RuntimeException( "Unable to access walker", ex );
|
throw new RuntimeException( "Unable to access walker", ex );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prepare the sort ordering w.r.t. the sequence dictionary
|
||||||
|
FastaSequenceFile2 refFile = null;
|
||||||
|
if (REF_FILE_ARG != null) {
|
||||||
|
refFile = new FastaSequenceFile2(REF_FILE_ARG);
|
||||||
|
GenomeLoc.setupRefContigOrdering(refFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
MicroManager microManager = null;
|
||||||
|
|
||||||
// Try to get the walker specified
|
// Try to get the walker specified
|
||||||
try {
|
try {
|
||||||
LocusWalker<?, ?> walker = (LocusWalker<?, ?>) my_walker;
|
LocusWalker<?, ?> walker = (LocusWalker<?, ?>) my_walker;
|
||||||
|
|
@ -245,9 +263,18 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
if ( walker.cannotHandleReads() )
|
if ( walker.cannotHandleReads() )
|
||||||
Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILE));
|
Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILE));
|
||||||
|
|
||||||
|
if ( WALK_ALL_LOCI ) {
|
||||||
if ( WALK_ALL_LOCI )
|
// TODO: Temporary debugging code. Activate the new debugging code only when the MicroManager
|
||||||
|
// is not filtered.
|
||||||
|
if( ENABLE_THREADING && REGION_STR == null ) {
|
||||||
|
logger.warn("Preliminary threading support enabled");
|
||||||
|
microManager = new MicroManager( INPUT_FILE, REF_FILE_ARG, numThreads );
|
||||||
|
this.engine = microManager.getTraversalEngine();
|
||||||
|
}
|
||||||
|
else {
|
||||||
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
|
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
|
||||||
|
}
|
||||||
|
}
|
||||||
else
|
else
|
||||||
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
|
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
|
||||||
}
|
}
|
||||||
|
|
@ -263,14 +290,13 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
|
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
|
||||||
GenomeLoc.setupRefContigOrdering(refFile);
|
GenomeLoc.setupRefContigOrdering(refFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Determine the validation stringency. Default to ValidationStringency.STRICT.
|
||||||
ValidationStringency strictness;
|
ValidationStringency strictness;
|
||||||
if (STRICTNESS_ARG == null) {
|
try {
|
||||||
strictness = ValidationStringency.STRICT;
|
strictness = Enum.valueOf(ValidationStringency.class, STRICTNESS_ARG);
|
||||||
} else if (STRICTNESS_ARG.toLowerCase().equals("lenient")) {
|
}
|
||||||
strictness = ValidationStringency.LENIENT;
|
catch( IllegalArgumentException ex ) {
|
||||||
} else if (STRICTNESS_ARG.toLowerCase().equals("silent")) {
|
|
||||||
strictness = ValidationStringency.SILENT;
|
|
||||||
} else {
|
|
||||||
strictness = ValidationStringency.STRICT;
|
strictness = ValidationStringency.STRICT;
|
||||||
}
|
}
|
||||||
logger.info("Strictness is " + strictness);
|
logger.info("Strictness is " + strictness);
|
||||||
|
|
@ -304,6 +330,11 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
engine.setWalkOverAllSites(WALK_ALL_LOCI);
|
engine.setWalkOverAllSites(WALK_ALL_LOCI);
|
||||||
engine.initialize();
|
engine.initialize();
|
||||||
|
|
||||||
|
if( microManager != null ) {
|
||||||
|
List<GenomeLoc> locations = GenomeLoc.parseGenomeLocs( REGION_STR );
|
||||||
|
microManager.execute( my_walker, locations );
|
||||||
|
}
|
||||||
|
else
|
||||||
engine.traverse(my_walker);
|
engine.traverse(my_walker);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,112 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.providers;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByHanger;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
import edu.mit.broad.picard.filter.FilteringIterator;
|
||||||
|
import edu.mit.broad.picard.filter.SamRecordFilter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: hanna
|
||||||
|
* Date: Apr 8, 2009
|
||||||
|
* Time: 3:00:28 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class LocusContextProvider {
|
||||||
|
private Iterator<SAMRecord> reads;
|
||||||
|
|
||||||
|
// What's the last locus accessed? Used for sanity checking.
|
||||||
|
private GenomeLoc lastLoc = null;
|
||||||
|
private LocusIterator loci;
|
||||||
|
|
||||||
|
protected static Logger logger = Logger.getLogger(LocusContextProvider.class);
|
||||||
|
|
||||||
|
public LocusContextProvider( Iterator<SAMRecord> reads ) {
|
||||||
|
this.reads = new FilteringIterator(reads, new locusStreamFilterFunc());
|
||||||
|
// prepare the iterator by loci from reads
|
||||||
|
loci = new LocusIteratorByHanger(this.reads);
|
||||||
|
}
|
||||||
|
|
||||||
|
public LocusContext getLocusContext( GenomeLoc loc ) {
|
||||||
|
// Precondition checks
|
||||||
|
if( lastLoc != null && !loc.isPast( lastLoc ) )
|
||||||
|
throw new RuntimeException( "Internal error: LocusContextProvider assumes that queries it receives are ordered." );
|
||||||
|
|
||||||
|
if( (loc.getStop() - loc.getStart()) > 0 )
|
||||||
|
throw new RuntimeException( "Internal error :LocusContextProviders currently require 1-base genomeLocs.");
|
||||||
|
|
||||||
|
// jump to the first reference site
|
||||||
|
LocusContext locusContext = advanceReadsToLoc( loci, loc );
|
||||||
|
|
||||||
|
// if no locus context was found, create an empty locus
|
||||||
|
if ( locusContext == null || locusContext.getLocation().compareTo( loc ) != 0 )
|
||||||
|
locusContext = new LocusContext(loc, new ArrayList<SAMRecord>(), new ArrayList<Integer>());
|
||||||
|
|
||||||
|
lastLoc = loc;
|
||||||
|
|
||||||
|
return locusContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
private LocusContext advanceReadsToLoc(LocusIterator locusIter, GenomeLoc target) {
|
||||||
|
if ( ! locusIter.hasNext() )
|
||||||
|
return null;
|
||||||
|
|
||||||
|
LocusContext locus = locusIter.next();
|
||||||
|
|
||||||
|
while ( ! target.containsP(locus.getLocation()) && locusIter.hasNext() ) {
|
||||||
|
logger.debug(String.format(" current locus is %s vs %s => %d", locus.getLocation(), target, locus.getLocation().compareTo(target)));
|
||||||
|
locus = locusIter.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.debug(String.format(" returning %s", locus.getLocation()));
|
||||||
|
return locus;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class to filter out un-handle-able reads from the stream. We currently are skipping
|
||||||
|
* unmapped reads, non-primary reads, unaligned reads, and those with indels. We should
|
||||||
|
* really change this to handle indel containing reads.
|
||||||
|
* TODO: Update TraversalEngine with our runtime stats.
|
||||||
|
*/
|
||||||
|
private class locusStreamFilterFunc implements SamRecordFilter {
|
||||||
|
SAMRecord lastRead = null;
|
||||||
|
public boolean filterOut(SAMRecord rec) {
|
||||||
|
boolean result = false;
|
||||||
|
String why = "";
|
||||||
|
if (rec.getReadUnmappedFlag()) {
|
||||||
|
TraversalStatistics.nUnmappedReads++;
|
||||||
|
result = true;
|
||||||
|
why = "Unmapped";
|
||||||
|
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
||||||
|
TraversalStatistics.nNotPrimary++;
|
||||||
|
result = true;
|
||||||
|
why = "Not Primary";
|
||||||
|
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
||||||
|
TraversalStatistics.nBadAlignments++;
|
||||||
|
result = true;
|
||||||
|
why = "No alignment start";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
result = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result) {
|
||||||
|
//nSkippedReads++;
|
||||||
|
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
|
||||||
|
} else {
|
||||||
|
//nReads++;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.providers;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: hanna
|
||||||
|
* Date: Apr 8, 2009
|
||||||
|
* Time: 5:01:37 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class ReferenceProvider {
|
||||||
|
private ReferenceIterator reference;
|
||||||
|
|
||||||
|
public ReferenceProvider( ReferenceIterator reference ) {
|
||||||
|
this.reference = reference;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReferenceIterator getReferenceSequence( GenomeLoc genomeLoc ) {
|
||||||
|
if( (genomeLoc.getStop() - genomeLoc.getStart()) > 0 )
|
||||||
|
throw new RuntimeException( "Internal error :LocusContextProviders currently require 1-base genomeLocs.");
|
||||||
|
|
||||||
|
// jump to the first reference site
|
||||||
|
return reference.seekForward(genomeLoc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,22 +1,88 @@
|
||||||
package org.broadinstitute.sting.gatk.executive;
|
package org.broadinstitute.sting.gatk.executive;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
|
||||||
|
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A micro-scheduling manager for N-way threaded execution of a traversal
|
* A micro-scheduling manager for N-way threaded execution of a traversal
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class MicroManager {
|
public class MicroManager {
|
||||||
|
private static long SHARD_SIZE = 5L;
|
||||||
|
|
||||||
public MicroManager( TraversalEngineExecutive TEfactory, // makes worker units
|
private File reads;
|
||||||
GenomeLoc[] locations, // list of work to do
|
private FastaSequenceFile2 ref;
|
||||||
int nThreadsToUse, // maximum number of threads to use to do the work
|
|
||||||
int initialChunkSize ) { // the initial chunk size for dividing up the work
|
private TraverseLociByReference traversalEngine = null;
|
||||||
// do a lot of work here to organize the computation
|
|
||||||
|
protected static Logger logger = Logger.getLogger(MicroManager.class);
|
||||||
|
|
||||||
|
public TraversalEngine getTraversalEngine() {
|
||||||
|
return traversalEngine;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void execute() {
|
public MicroManager( File reads, // the reads file
|
||||||
// actually divide up the work, create worker units, and send them off to do work
|
File refFile, // the reference file driving the traversal
|
||||||
|
int nThreadsToUse ) { // maximum number of threads to use to do the work
|
||||||
|
|
||||||
|
this.reads = reads;
|
||||||
|
ref = new FastaSequenceFile2(refFile);
|
||||||
|
GenomeLoc.setupRefContigOrdering(ref);
|
||||||
|
|
||||||
|
traversalEngine = new TraverseLociByReference( reads, refFile, new java.util.ArrayList() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void execute( Walker walker, // the analysis technique to use.
|
||||||
|
List<GenomeLoc> locations ) { // list of work to do
|
||||||
|
ShardStrategy shardStrategy = ShardStrategyFactory.shatter( ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
|
||||||
|
ref.getSequenceDictionary(),
|
||||||
|
SHARD_SIZE );
|
||||||
|
Object accumulator = ((LocusWalker<?,?>)walker).reduceInit();
|
||||||
|
|
||||||
|
for(Shard shard: shardStrategy) {
|
||||||
|
Iterator<SAMRecord> readShard = null;
|
||||||
|
try {
|
||||||
|
SAMBAMDataSource dataSource = new SAMBAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) );
|
||||||
|
readShard = dataSource.seek( shard.getGenomeLoc() );
|
||||||
|
}
|
||||||
|
catch( SimpleDataSourceLoadException ex ) {
|
||||||
|
throw new RuntimeException( ex );
|
||||||
|
}
|
||||||
|
catch( IOException ex ) {
|
||||||
|
throw new RuntimeException( ex );
|
||||||
|
}
|
||||||
|
|
||||||
|
ReferenceProvider referenceProvider = new ReferenceProvider( new ReferenceIterator(ref) );
|
||||||
|
LocusContextProvider locusProvider = new LocusContextProvider( readShard );
|
||||||
|
|
||||||
|
accumulator = traversalEngine.traverse( walker, shard, referenceProvider, locusProvider, accumulator );
|
||||||
|
}
|
||||||
|
|
||||||
|
traversalEngine.printOnTraversalDone("loci", accumulator);
|
||||||
|
walker.onTraversalDone(accumulator);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,31 +1,21 @@
|
||||||
package org.broadinstitute.sting.gatk.traversals;
|
package org.broadinstitute.sting.gatk.traversals;
|
||||||
|
|
||||||
import edu.mit.broad.picard.filter.FilteringIterator;
|
|
||||||
import edu.mit.broad.picard.filter.SamRecordFilter;
|
import edu.mit.broad.picard.filter.SamRecordFilter;
|
||||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||||
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
||||||
import edu.mit.broad.picard.directed.IntervalList;
|
import edu.mit.broad.picard.directed.IntervalList;
|
||||||
import edu.mit.broad.picard.util.Interval;
|
import edu.mit.broad.picard.util.Interval;
|
||||||
import net.sf.functionalj.Function1;
|
|
||||||
import net.sf.functionalj.FunctionN;
|
|
||||||
import net.sf.functionalj.Functions;
|
|
||||||
import net.sf.functionalj.reflect.JdkStdReflect;
|
|
||||||
import net.sf.functionalj.reflect.StdReflect;
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMFileReader;
|
import net.sf.samtools.SAMFileReader;
|
||||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
import net.sf.samtools.util.RuntimeIOException;
|
import net.sf.samtools.util.RuntimeIOException;
|
||||||
import net.sf.samtools.util.CloseableIterator;
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.iterators.*;
|
import org.broadinstitute.sting.gatk.iterators.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
|
@ -66,16 +56,6 @@ public abstract class TraversalEngine {
|
||||||
protected FastaSequenceFile2 refFile = null; // todo: merge FastaSequenceFile2 into picard!
|
protected FastaSequenceFile2 refFile = null; // todo: merge FastaSequenceFile2 into picard!
|
||||||
protected ReferenceIterator refIter = null;
|
protected ReferenceIterator refIter = null;
|
||||||
|
|
||||||
// Number of records (loci, reads) we've processed
|
|
||||||
protected long nRecords = 0;
|
|
||||||
// How many reads have we processed, along with those skipped for various reasons
|
|
||||||
protected int nReads = 0;
|
|
||||||
protected int nSkippedReads = 0;
|
|
||||||
protected int nUnmappedReads = 0;
|
|
||||||
protected int nNotPrimary = 0;
|
|
||||||
protected int nBadAlignments = 0;
|
|
||||||
protected int nSkippedIndels = 0;
|
|
||||||
|
|
||||||
// Progress tracker for the sam file
|
// Progress tracker for the sam file
|
||||||
protected FileProgressTracker samReadingTracker = null;
|
protected FileProgressTracker samReadingTracker = null;
|
||||||
|
|
||||||
|
|
@ -277,7 +257,7 @@ public abstract class TraversalEngine {
|
||||||
* @param loc Current location
|
* @param loc Current location
|
||||||
*/
|
*/
|
||||||
public void printProgress(boolean mustPrint, final String type, GenomeLoc loc) {
|
public void printProgress(boolean mustPrint, final String type, GenomeLoc loc) {
|
||||||
final long nRecords = this.nRecords;
|
final long nRecords = TraversalStatistics.nRecords;
|
||||||
final long curTime = System.currentTimeMillis();
|
final long curTime = System.currentTimeMillis();
|
||||||
final double elapsed = (curTime - startTime) / 1000.0;
|
final double elapsed = (curTime - startTime) / 1000.0;
|
||||||
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
|
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
|
||||||
|
|
@ -307,17 +287,20 @@ public abstract class TraversalEngine {
|
||||||
* @param sum The reduce result of the traversal
|
* @param sum The reduce result of the traversal
|
||||||
* @param <T> ReduceType of the traversal
|
* @param <T> ReduceType of the traversal
|
||||||
*/
|
*/
|
||||||
protected <T> void printOnTraversalDone(final String type, T sum) {
|
public <T> void printOnTraversalDone(final String type, T sum) {
|
||||||
printProgress(true, type, null);
|
printProgress(true, type, null);
|
||||||
logger.info("Traversal reduce result is " + sum);
|
logger.info("Traversal reduce result is " + sum);
|
||||||
final long curTime = System.currentTimeMillis();
|
final long curTime = System.currentTimeMillis();
|
||||||
final double elapsed = (curTime - startTime) / 1000.0;
|
final double elapsed = (curTime - startTime) / 1000.0;
|
||||||
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
|
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
|
||||||
logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)", nSkippedReads, nReads, (nSkippedReads * 100.0) / nReads));
|
logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)",
|
||||||
logger.info(String.format(" -> %d unmapped reads", nUnmappedReads));
|
TraversalStatistics.nSkippedReads,
|
||||||
logger.info(String.format(" -> %d non-primary reads", nNotPrimary));
|
TraversalStatistics.nReads,
|
||||||
logger.info(String.format(" -> %d reads with bad alignments", nBadAlignments));
|
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
|
||||||
logger.info(String.format(" -> %d reads with indels", nSkippedIndels));
|
logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
|
||||||
|
logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary));
|
||||||
|
logger.info(String.format(" -> %d reads with bad alignments", TraversalStatistics.nBadAlignments));
|
||||||
|
logger.info(String.format(" -> %d reads with indels", TraversalStatistics.nSkippedIndels));
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -520,15 +503,15 @@ public abstract class TraversalEngine {
|
||||||
boolean result = false;
|
boolean result = false;
|
||||||
String why = "";
|
String why = "";
|
||||||
if (rec.getReadUnmappedFlag()) {
|
if (rec.getReadUnmappedFlag()) {
|
||||||
nUnmappedReads++;
|
TraversalStatistics.nUnmappedReads++;
|
||||||
result = true;
|
result = true;
|
||||||
why = "Unmapped";
|
why = "Unmapped";
|
||||||
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
} else if (rec.getNotPrimaryAlignmentFlag()) {
|
||||||
nNotPrimary++;
|
TraversalStatistics.nNotPrimary++;
|
||||||
result = true;
|
result = true;
|
||||||
why = "Not Primary";
|
why = "Not Primary";
|
||||||
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
} else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
|
||||||
nBadAlignments++;
|
TraversalStatistics.nBadAlignments++;
|
||||||
result = true;
|
result = true;
|
||||||
why = "No alignment start";
|
why = "No alignment start";
|
||||||
}
|
}
|
||||||
|
|
@ -537,10 +520,10 @@ public abstract class TraversalEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (result) {
|
if (result) {
|
||||||
nSkippedReads++;
|
TraversalStatistics.nSkippedReads++;
|
||||||
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
|
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
|
||||||
} else {
|
} else {
|
||||||
nReads++;
|
TraversalStatistics.nReads++;
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.broadinstitute.sting.gatk.traversals;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: hanna
|
||||||
|
* Date: Apr 8, 2009
|
||||||
|
* Time: 4:13:40 PM
|
||||||
|
*
|
||||||
|
* Holds a bunch of basic information about the traversal.
|
||||||
|
* TODO: Make this a class that can be passed around from the TraversalEngine to other entries that want to update it.
|
||||||
|
*/
|
||||||
|
public class TraversalStatistics {
|
||||||
|
// Number of records (loci, reads) we've processed
|
||||||
|
public static long nRecords;
|
||||||
|
// How many reads have we processed, along with those skipped for various reasons
|
||||||
|
public static int nReads;
|
||||||
|
public static int nSkippedReads;
|
||||||
|
public static int nUnmappedReads;
|
||||||
|
public static int nNotPrimary;
|
||||||
|
public static int nBadAlignments;
|
||||||
|
public static int nSkippedIndels;
|
||||||
|
|
||||||
|
static {
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void reset() {
|
||||||
|
nRecords = 0;
|
||||||
|
nReads = 0;
|
||||||
|
nSkippedReads = 0;
|
||||||
|
nUnmappedReads = 0;
|
||||||
|
nNotPrimary = 0;
|
||||||
|
nBadAlignments = 0;
|
||||||
|
nSkippedIndels = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -107,7 +107,7 @@ public class TraverseByLoci extends TraversalEngine {
|
||||||
boolean done = false;
|
boolean done = false;
|
||||||
LocusIterator iter = new LocusIteratorByHanger(filterIter);
|
LocusIterator iter = new LocusIteratorByHanger(filterIter);
|
||||||
while (iter.hasNext() && !done) {
|
while (iter.hasNext() && !done) {
|
||||||
this.nRecords++;
|
TraversalStatistics.nRecords++;
|
||||||
|
|
||||||
// actually get the read and hand it to the walker
|
// actually get the read and hand it to the walker
|
||||||
LocusContext locus = iter.next();
|
LocusContext locus = iter.next();
|
||||||
|
|
@ -127,8 +127,8 @@ public class TraverseByLoci extends TraversalEngine {
|
||||||
|
|
||||||
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
||||||
|
|
||||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
|
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||||
done = true;
|
done = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -61,7 +61,7 @@ public class TraverseByLociByReference extends TraverseByLoci {
|
||||||
while ( interval.containsP(refSite.getLocation()) && ! done ) {
|
while ( interval.containsP(refSite.getLocation()) && ! done ) {
|
||||||
logger.debug(String.format(" LocusFromReads is %s", locusFromReads == null ? null : locusFromReads.getLocation()));
|
logger.debug(String.format(" LocusFromReads is %s", locusFromReads == null ? null : locusFromReads.getLocation()));
|
||||||
|
|
||||||
this.nRecords++;
|
TraversalStatistics.nRecords++;
|
||||||
GenomeLoc current = refSite.getLocation();
|
GenomeLoc current = refSite.getLocation();
|
||||||
|
|
||||||
// Iterate forward to get all reference ordered data covering this locus
|
// Iterate forward to get all reference ordered data covering this locus
|
||||||
|
|
@ -82,8 +82,8 @@ public class TraverseByLociByReference extends TraverseByLoci {
|
||||||
locus.downsampleToCoverage(downsamplingCoverage);
|
locus.downsampleToCoverage(downsamplingCoverage);
|
||||||
sum = walkAtLocus( walker, sum, locus, refSite, tracker );
|
sum = walkAtLocus( walker, sum, locus, refSite, tracker );
|
||||||
|
|
||||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
|
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||||
done = true;
|
done = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -76,7 +76,7 @@ public class TraverseByReads extends TraversalEngine {
|
||||||
// copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently
|
// copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently
|
||||||
LinkedList notYetTraversedLocations = new LinkedList(locations);
|
LinkedList notYetTraversedLocations = new LinkedList(locations);
|
||||||
while (samReadIter.hasNext() && !done) {
|
while (samReadIter.hasNext() && !done) {
|
||||||
this.nRecords++;
|
TraversalStatistics.nRecords++;
|
||||||
|
|
||||||
// get the next read
|
// get the next read
|
||||||
final SAMRecord read = samReadIter.next();
|
final SAMRecord read = samReadIter.next();
|
||||||
|
|
@ -102,8 +102,8 @@ public class TraverseByReads extends TraversalEngine {
|
||||||
sum = walker.reduce(x, sum);
|
sum = walker.reduce(x, sum);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||||
logger.warn(String.format(("Maximum number of reads encountered, terminating traversal " + this.nRecords)));
|
logger.warn(String.format(("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)));
|
||||||
done = true;
|
done = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -87,7 +87,7 @@ public class TraverseByReference extends TraverseByLoci {
|
||||||
|
|
||||||
// We keep processing while the next reference location is within the interval
|
// We keep processing while the next reference location is within the interval
|
||||||
while ( (interval == null || interval.containsP(refSite.getLocation())) && ! done ) {
|
while ( (interval == null || interval.containsP(refSite.getLocation())) && ! done ) {
|
||||||
this.nRecords++;
|
TraversalStatistics.nRecords++;
|
||||||
GenomeLoc current = refSite.getLocation();
|
GenomeLoc current = refSite.getLocation();
|
||||||
|
|
||||||
// Iterate forward to get all reference ordered data covering this locus
|
// Iterate forward to get all reference ordered data covering this locus
|
||||||
|
|
@ -97,8 +97,8 @@ public class TraverseByReference extends TraverseByLoci {
|
||||||
locus.setReferenceContig(refSite.getCurrentContig());
|
locus.setReferenceContig(refSite.getCurrentContig());
|
||||||
sum = walkAtLocus( walker, sum, locus, refSite, tracker );
|
sum = walkAtLocus( walker, sum, locus, refSite, tracker );
|
||||||
|
|
||||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
|
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||||
done = true;
|
done = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,93 @@
|
||||||
|
package org.broadinstitute.sting.gatk.traversals;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
|
||||||
|
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A simple, short-term solution to iterating over all reference positions over a series of
|
||||||
|
* genomic locations. Simply overloads the superclass traverse function to go over the entire
|
||||||
|
* interval's reference positions.
|
||||||
|
* mhanna - Added better data source integration.
|
||||||
|
* TODO: Gain confidence in this implementation and remove the original.
|
||||||
|
*/
|
||||||
|
public class TraverseLociByReference extends TraversalEngine {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* our log, which we want to capture anything from this class
|
||||||
|
*/
|
||||||
|
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||||
|
|
||||||
|
|
||||||
|
public TraverseLociByReference(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||||
|
super( reads, ref, rods );
|
||||||
|
}
|
||||||
|
|
||||||
|
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
|
||||||
|
if ( locations.isEmpty() )
|
||||||
|
Utils.scareUser("Requested all locations be processed without providing locations to be processed!");
|
||||||
|
|
||||||
|
throw new UnsupportedOperationException("This traversal type not supported by TraverseLociByReference");
|
||||||
|
}
|
||||||
|
|
||||||
|
public <M,T> T traverse( Walker<M,T> walker,
|
||||||
|
Shard shard,
|
||||||
|
ReferenceProvider referenceProvider,
|
||||||
|
LocusContextProvider locusProvider,
|
||||||
|
T sum ) {
|
||||||
|
logger.debug(String.format("TraverseLociByReference.traverse Genomic interval is %s", shard.getGenomeLoc()));
|
||||||
|
|
||||||
|
if ( !(walker instanceof LocusWalker) )
|
||||||
|
throw new IllegalArgumentException("Walker isn't a loci walker!");
|
||||||
|
|
||||||
|
LocusWalker<M, T> locusWalker = (LocusWalker<M, T>)walker;
|
||||||
|
GenomeLoc loc = shard.getGenomeLoc();
|
||||||
|
|
||||||
|
// We keep processing while the next reference location is within the interval
|
||||||
|
for( long pos = loc.getStart(); pos <= loc.getStop(); pos++ ) {
|
||||||
|
GenomeLoc site = new GenomeLoc( loc.getContig(), pos );
|
||||||
|
|
||||||
|
TraversalStatistics.nRecords++;
|
||||||
|
|
||||||
|
// Iterate forward to get all reference ordered data covering this locus
|
||||||
|
final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus( site );
|
||||||
|
|
||||||
|
ReferenceIterator refSite = referenceProvider.getReferenceSequence( site );
|
||||||
|
LocusContext locus = locusProvider.getLocusContext( site );
|
||||||
|
locus.setReferenceContig( refSite.getCurrentContig() );
|
||||||
|
|
||||||
|
if ( DOWNSAMPLE_BY_COVERAGE )
|
||||||
|
locus.downsampleToCoverage(downsamplingCoverage);
|
||||||
|
|
||||||
|
final char refBase = refSite.getBaseAsChar();
|
||||||
|
final boolean keepMeP = locusWalker.filter(tracker, refBase, locus);
|
||||||
|
if (keepMeP) {
|
||||||
|
M x = locusWalker.map(tracker, refBase, locus);
|
||||||
|
sum = locusWalker.reduce(x, sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||||
|
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
printProgress("loci", locus.getLocation());
|
||||||
|
}
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -206,10 +206,13 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
||||||
* Useful utility function that parses a location string into a coordinate-order sorted
|
* Useful utility function that parses a location string into a coordinate-order sorted
|
||||||
* array of GenomeLoc objects
|
* array of GenomeLoc objects
|
||||||
*
|
*
|
||||||
* @param str
|
* @param str String representation of genome locs. Null string corresponds to no filter.
|
||||||
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
|
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
|
||||||
*/
|
*/
|
||||||
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
|
public static ArrayList<GenomeLoc> parseGenomeLocs(final String str) {
|
||||||
|
// Null string means no filter.
|
||||||
|
if( str == null ) return null;
|
||||||
|
|
||||||
// Of the form: loc1;loc2;...
|
// Of the form: loc1;loc2;...
|
||||||
// Where each locN can be:
|
// Where each locN can be:
|
||||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue