Refactoring of the traversal engine base class, I removed a lot of old code.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1209 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3519323156
commit
d86717db93
|
|
@ -27,20 +27,18 @@ package org.broadinstitute.sting.gatk;
|
|||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.traversals.*;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.io.*;
|
||||
|
||||
public class GenomeAnalysisEngine {
|
||||
|
||||
|
|
@ -100,7 +98,7 @@ public class GenomeAnalysisEngine {
|
|||
if (argCollection.HAPMAPChipFile != null)
|
||||
bindConvenienceRods("hapmap-chip", "GFF", argCollection.HAPMAPChipFile);
|
||||
// TODO: The ROD iterator currently does not understand multiple intervals file. Fix this by cleaning the ROD system.
|
||||
if ( argCollection.intervals != null && argCollection.intervals.size() == 1) {
|
||||
if (argCollection.intervals != null && argCollection.intervals.size() == 1) {
|
||||
bindConvenienceRods("interval", "Intervals", argCollection.intervals.get(0).replaceAll(",", ""));
|
||||
}
|
||||
|
||||
|
|
@ -111,7 +109,7 @@ public class GenomeAnalysisEngine {
|
|||
validateInputsAgainstWalker(my_walker, argCollection, rods);
|
||||
|
||||
// create the output streams
|
||||
initializeOutputStreams( my_walker );
|
||||
initializeOutputStreams(my_walker);
|
||||
|
||||
// our microscheduler, which is in charge of running everything
|
||||
MicroScheduler microScheduler = null;
|
||||
|
|
@ -129,13 +127,10 @@ public class GenomeAnalysisEngine {
|
|||
// perform validation steps that are common to all the engines
|
||||
genericEngineSetup();
|
||||
|
||||
// parse out any genomic location they've provided
|
||||
//List<GenomeLoc> locationsList = setupIntervalRegion();
|
||||
List<GenomeLoc> locationsList = engine.getLocations();
|
||||
GenomeLocSortedSet locs = null;
|
||||
if (locationsList != null)
|
||||
locs = GenomeLocSortedSet.createSetFromList(locationsList);
|
||||
|
||||
if (argCollection.intervals != null) {
|
||||
locs = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.intervals));
|
||||
}
|
||||
// excute the microscheduler, storing the results
|
||||
walkerReturn = microScheduler.execute(my_walker, locs, argCollection.maximumEngineIterations);
|
||||
}
|
||||
|
|
@ -145,6 +140,7 @@ public class GenomeAnalysisEngine {
|
|||
*
|
||||
* @param my_walker our walker of type LocusWalker
|
||||
* @param rods the reference order data
|
||||
*
|
||||
* @return a new microscheduler
|
||||
*/
|
||||
private MicroScheduler createMicroscheduler(Walker my_walker, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
|
|
@ -156,38 +152,23 @@ public class GenomeAnalysisEngine {
|
|||
// create the MicroScheduler
|
||||
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
|
||||
engine = microScheduler.getTraversalEngine();
|
||||
}
|
||||
else if (my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
|
||||
} else if (my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
|
||||
if (argCollection.referenceFile == null)
|
||||
Utils.scareUser(String.format("Read-based traversals require a reference file but none was given"));
|
||||
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
|
||||
engine = microScheduler.getTraversalEngine();
|
||||
} else {
|
||||
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
|
||||
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
|
||||
}
|
||||
|
||||
return microScheduler;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* commands that get executed for each engine, regardless of the type
|
||||
*/
|
||||
/** commands that get executed for each engine, regardless of the type */
|
||||
private void genericEngineSetup() {
|
||||
Reads sourceInfo = extractSourceInfoFromArguments(argCollection);
|
||||
|
||||
engine.setStrictness(sourceInfo.getValidationStringency());
|
||||
|
||||
engine.setMaxReads(argCollection.maximumEngineIterations);
|
||||
engine.setFilterZeroMappingQualityReads(argCollection.filterZeroMappingQualityReads);
|
||||
|
||||
// we default interval files over the genome region string
|
||||
if (argCollection.intervals != null) {
|
||||
engine.setLocation(parseIntervalRegion(argCollection.intervals));
|
||||
}
|
||||
|
||||
engine.setReadFilters(sourceInfo);
|
||||
|
||||
engine.initialize();
|
||||
}
|
||||
|
||||
|
|
@ -196,9 +177,9 @@ public class GenomeAnalysisEngine {
|
|||
*
|
||||
* @return a list of genomeLoc representing the interval file
|
||||
*/
|
||||
public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals ) {
|
||||
public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals) {
|
||||
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
||||
for( String interval: intervals ) {
|
||||
for (String interval : intervals) {
|
||||
if (new File(interval).exists()) {
|
||||
locs.addAll(GenomeLocParser.intervalFileToList(interval));
|
||||
} else {
|
||||
|
|
@ -211,9 +192,12 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
/**
|
||||
* Bundles all the source information about the reads into a unified data structure.
|
||||
*
|
||||
* @param argCollection The collection of arguments passed to the engine.
|
||||
*
|
||||
* @return The reads object providing reads source info.
|
||||
*/
|
||||
|
||||
private Reads extractSourceInfoFromArguments( GATKArgumentCollection argCollection ) {
|
||||
return new Reads( argCollection.samFiles,
|
||||
argCollection.strictnessLevel,
|
||||
|
|
@ -229,33 +213,33 @@ public class GenomeAnalysisEngine {
|
|||
String walkerName = WalkerManager.getWalkerName(walker.getClass());
|
||||
|
||||
// Check what the walker says is required against what was provided on the command line.
|
||||
if( WalkerManager.isRequired(walker,DataSource.READS) && (arguments.samFiles == null || arguments.samFiles.size() == 0) )
|
||||
throw new ArgumentException(String.format("Walker %s requires reads but none were provided. If this is incorrect, alter the walker's @Requires annotation.",walkerName));
|
||||
if( WalkerManager.isRequired(walker,DataSource.REFERENCE) && arguments.referenceFile == null )
|
||||
throw new ArgumentException(String.format("Walker %s requires a reference but none was provided. If this is incorrect, alter the walker's @Requires annotation.",walkerName));
|
||||
if (WalkerManager.isRequired(walker, DataSource.READS) && (arguments.samFiles == null || arguments.samFiles.size() == 0))
|
||||
throw new ArgumentException(String.format("Walker %s requires reads but none were provided. If this is incorrect, alter the walker's @Requires annotation.", walkerName));
|
||||
if (WalkerManager.isRequired(walker, DataSource.REFERENCE) && arguments.referenceFile == null)
|
||||
throw new ArgumentException(String.format("Walker %s requires a reference but none was provided. If this is incorrect, alter the walker's @Requires annotation.", walkerName));
|
||||
|
||||
// Check what the walker says is allowed against what was provided on the command line.
|
||||
if( (arguments.samFiles != null && arguments.samFiles.size() > 0) && !WalkerManager.isAllowed(walker,DataSource.READS) )
|
||||
throw new ArgumentException(String.format("Walker %s does not allow reads but reads were provided. If this is incorrect, alter the walker's @Allows annotation",walkerName));
|
||||
if( arguments.referenceFile != null && !WalkerManager.isAllowed(walker,DataSource.REFERENCE) )
|
||||
throw new ArgumentException(String.format("Walker %s does not allow a reference but one was provided. If this is incorrect, alter the walker's @Allows annotation",walkerName));
|
||||
if ((arguments.samFiles != null && arguments.samFiles.size() > 0) && !WalkerManager.isAllowed(walker, DataSource.READS))
|
||||
throw new ArgumentException(String.format("Walker %s does not allow reads but reads were provided. If this is incorrect, alter the walker's @Allows annotation", walkerName));
|
||||
if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE))
|
||||
throw new ArgumentException(String.format("Walker %s does not allow a reference but one was provided. If this is incorrect, alter the walker's @Allows annotation", walkerName));
|
||||
|
||||
// Check to make sure that all required metadata is present.
|
||||
List<RMD> allRequired = WalkerManager.getRequiredMetaData(walker);
|
||||
for( RMD required: allRequired ) {
|
||||
for (RMD required : allRequired) {
|
||||
boolean found = false;
|
||||
for( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod: rods ) {
|
||||
if( rod.matches(required.name(),required.type()) )
|
||||
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> rod : rods) {
|
||||
if (rod.matches(required.name(), required.type()))
|
||||
found = true;
|
||||
}
|
||||
if( !found )
|
||||
throw new ArgumentException(String.format("Unable to find reference metadata (%s,%s)",required.name(),required.type()));
|
||||
if (!found)
|
||||
throw new ArgumentException(String.format("Unable to find reference metadata (%s,%s)", required.name(), required.type()));
|
||||
}
|
||||
|
||||
// Check to see that no forbidden rods are present.
|
||||
for( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod: rods ) {
|
||||
if( !WalkerManager.isAllowed(walker,rod) )
|
||||
throw new ArgumentException(String.format("Walker does not allow access to metadata: %s. If this is correct, change the @Allows metadata",rod.getName()));
|
||||
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> rod : rods) {
|
||||
if (!WalkerManager.isAllowed(walker, rod))
|
||||
throw new ArgumentException(String.format("Walker does not allow access to metadata: %s. If this is correct, change the @Allows metadata", rod.getName()));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -266,7 +250,7 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
public int getBAMCompression() {
|
||||
return (argCollection.BAMcompression == null ||
|
||||
argCollection.BAMcompression < 1 ||
|
||||
argCollection.BAMcompression < 1 ||
|
||||
argCollection.BAMcompression > 8) ? 5 : argCollection.BAMcompression;
|
||||
}
|
||||
|
||||
|
|
@ -284,7 +268,7 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
|
||||
/** Initialize the output streams as specified by the user. */
|
||||
private void initializeOutputStreams( Walker walker ) {
|
||||
private void initializeOutputStreams(Walker walker) {
|
||||
outputTracker = (argCollection.outErrFileName != null) ? new OutputTracker(argCollection.outErrFileName, argCollection.outErrFileName)
|
||||
: new OutputTracker(argCollection.outFileName, argCollection.errFileName);
|
||||
walker.initializeOutputStreams(outputTracker);
|
||||
|
|
@ -299,17 +283,6 @@ public class GenomeAnalysisEngine {
|
|||
return outputTracker;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function is deprecated in the new traversal engines, if you need to get the header
|
||||
* please get the engine and then use the getHeader function
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Deprecated
|
||||
public SAMFileReader getSamReader() {
|
||||
return this.engine.getSamReader();
|
||||
}
|
||||
|
||||
public TraversalEngine getEngine() {
|
||||
return this.engine;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -101,13 +101,13 @@ public abstract class MicroScheduler {
|
|||
*/
|
||||
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
if (walker instanceof ReadWalker) {
|
||||
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
|
||||
traversalEngine = new TraverseReads();
|
||||
} else if (walker instanceof LocusWalker) {
|
||||
traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
|
||||
traversalEngine = new TraverseLoci();
|
||||
} else if (walker instanceof LocusWindowWalker) {
|
||||
traversalEngine = new TraverseLocusWindows(reads.getReadsFiles(), refFile, rods);
|
||||
traversalEngine = new TraverseLocusWindows();
|
||||
} else if (walker instanceof DuplicateWalker) {
|
||||
traversalEngine = new TraverseDuplicates(reads.getReadsFiles(), refFile, rods);
|
||||
traversalEngine = new TraverseDuplicates();
|
||||
} else {
|
||||
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,19 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.MatchResult;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.xReadLines;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.apache.log4j.Logger;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom
|
||||
|
|
|
|||
|
|
@ -1,218 +1,63 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
public abstract class TraversalEngine {
|
||||
// list of reference ordered data objects
|
||||
protected List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = null;
|
||||
|
||||
// Iterator over rods
|
||||
List<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>> rodIters;
|
||||
|
||||
// How strict should we be with SAM/BAM parsing?
|
||||
protected ValidationStringency strictness = ValidationStringency.STRICT;
|
||||
|
||||
// Time in milliseconds since we initialized this engine
|
||||
protected long startTime = -1;
|
||||
protected long lastProgressPrintTime = -1; // When was the last time we printed our progress?
|
||||
private long startTime = -1;
|
||||
private long lastProgressPrintTime = -1; // When was the last time we printed our progress?
|
||||
|
||||
// How long can we go without printing some progress info?
|
||||
protected long MAX_PROGRESS_PRINT_TIME = 30 * 1000; // 10 seconds in millisecs
|
||||
protected long N_RECORDS_TO_PRINT = 1000000;
|
||||
private final long MAX_PROGRESS_PRINT_TIME = 30 * 1000; // 10 seconds in millisecs
|
||||
private final long N_RECORDS_TO_PRINT = 1000000;
|
||||
|
||||
// Maximum number of reads to process before finishing
|
||||
protected long maxReads = -1;
|
||||
|
||||
// Name of the reads file, in BAM/SAM format
|
||||
protected List<File> readsFiles = null; // the name of the reads file
|
||||
protected SAMFileReader samReader = null;
|
||||
// iterator over the sam records in the readsFile
|
||||
protected Iterator<SAMRecord> samReadIter = null;
|
||||
|
||||
// The verifying iterator, it does checking
|
||||
protected VerifyingSamIterator verifyingSamReadIter = null;
|
||||
|
||||
// The reference data -- filename, refSeqFile, and iterator
|
||||
protected File refFileName = null; // the name of the reference file
|
||||
|
||||
// Progress tracker for the sam file
|
||||
protected FileProgressTracker samReadingTracker = null;
|
||||
|
||||
protected boolean DEBUGGING = false;
|
||||
protected boolean beSafeP = true;
|
||||
protected boolean SORT_ON_FLY = false;
|
||||
protected boolean DOWNSAMPLE_BY_FRACTION = false;
|
||||
protected boolean DOWNSAMPLE_BY_COVERAGE = false;
|
||||
protected boolean FILTER_UNSORTED_READS = false;
|
||||
protected boolean walkOverAllSites = false;
|
||||
protected int maxOnFlySorts = 100000;
|
||||
protected double downsamplingFraction = 1.0;
|
||||
protected int downsamplingCoverage = 0;
|
||||
protected boolean filterZeroMappingQualityReads = false;
|
||||
|
||||
|
||||
// the stored header
|
||||
protected SAMFileHeader myHeader = null;
|
||||
private SAMFileHeader myHeader = null;
|
||||
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
|
||||
|
||||
// Locations we are going to process during the traversal
|
||||
private List<GenomeLoc> locs = null;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Setting up the engine
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Creates a new, uninitialized TraversalEngine
|
||||
*
|
||||
* @param reads SAM/BAM file of reads
|
||||
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
||||
* @param rods Array of reference ordered data sets
|
||||
*/
|
||||
public TraversalEngine(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
readsFiles = reads;
|
||||
refFileName = ref;
|
||||
this.rods = rods;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Manipulating the underlying engine parameters
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//public void setRegion(final String reg) { regionStr = regionStr; }
|
||||
//public void setTraversalType(final String type) { traversalType = type; }
|
||||
public void setStrictness(final ValidationStringency s) {
|
||||
strictness = s;
|
||||
}
|
||||
|
||||
public void setMaxReads(final int maxReads) {
|
||||
this.maxReads = maxReads;
|
||||
}
|
||||
|
||||
public void setFilterZeroMappingQualityReads(final boolean filterZeroMappingQualityReads) {
|
||||
this.filterZeroMappingQualityReads = filterZeroMappingQualityReads;
|
||||
}
|
||||
|
||||
private void setDebugging(final boolean d) {
|
||||
DEBUGGING = d;
|
||||
}
|
||||
|
||||
private void setSafetyChecking(final boolean beSafeP) {
|
||||
if (!beSafeP)
|
||||
logger.warn("*** Turning off safety checking, I hope you know what you are doing. Errors will result in debugging assert failures and other inscrutable messages...");
|
||||
this.beSafeP = beSafeP;
|
||||
}
|
||||
|
||||
public void setFilterUnsortedReads(final boolean filterUnsorted) {
|
||||
if (!filterUnsorted)
|
||||
logger.warn("*** Turning on filtering of out of order reads, I *really* hope you know what you are doing, as you are removing data...");
|
||||
this.FILTER_UNSORTED_READS = filterUnsorted;
|
||||
}
|
||||
|
||||
private void setSortOnFly(final int maxReadsToSort) {
|
||||
logger.info("Sorting read file on the fly: max reads allowed is " + maxReadsToSort);
|
||||
SORT_ON_FLY = true;
|
||||
maxOnFlySorts = maxReadsToSort;
|
||||
}
|
||||
|
||||
private void setSortOnFly() { setSortOnFly(100000); }
|
||||
|
||||
private void setDownsampleByFraction(final double fraction) {
|
||||
logger.info("Downsampling to approximately " + (fraction * 100.0) + "% of filtered reads");
|
||||
DOWNSAMPLE_BY_FRACTION = true;
|
||||
downsamplingFraction = fraction;
|
||||
}
|
||||
|
||||
private void setDownsampleByCoverage(final int coverage) {
|
||||
logger.info("Downsampling to coverage " + coverage);
|
||||
DOWNSAMPLE_BY_COVERAGE = true;
|
||||
downsamplingCoverage = coverage;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets up read filtering performed by the traversal engine.
|
||||
* @param reads Read filters to instantiate as the traversal engine walks over the data.
|
||||
* @deprecated Should only be used by old-style traversals.
|
||||
*/
|
||||
@Deprecated
|
||||
public void setReadFilters( Reads reads) {
|
||||
if( reads.getDownsamplingFraction() != null ) setDownsampleByFraction(reads.getDownsamplingFraction());
|
||||
if( reads.getDownsampleToCoverage() != null ) setDownsampleByCoverage(reads.getDownsampleToCoverage());
|
||||
if( reads.getSafetyChecking() != null ) setSafetyChecking(reads.getSafetyChecking());
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// functions for dealing locations (areas of the genome we're traversing over)
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
/**
|
||||
* get the associated SAM header for our run
|
||||
*
|
||||
* @return the header if it's stored, null if not
|
||||
*/
|
||||
public SAMFileHeader getSAMHeader() {
|
||||
if ( myHeader == null )
|
||||
myHeader = samReader.getFileHeader();
|
||||
return myHeader;
|
||||
}
|
||||
|
||||
/**
|
||||
* set's the SAM header for this traversal, which should
|
||||
* be the merged header in the multiple BAM file case.
|
||||
*
|
||||
*
|
||||
* @param myHeader the passed in header
|
||||
*/
|
||||
|
||||
public void setSAMHeader(SAMFileHeader myHeader) {
|
||||
this.myHeader = myHeader;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the intervals over which the traversal(s) should happen.
|
||||
* @param locs
|
||||
*/
|
||||
public void setLocation(final List<GenomeLoc> locs) {
|
||||
this.locs = locs;
|
||||
}
|
||||
|
||||
|
||||
public boolean hasLocations() {
|
||||
return this.locs != null && !this.locs.isEmpty();
|
||||
}
|
||||
|
||||
public List<GenomeLoc> getLocations() {
|
||||
return this.locs;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// printing
|
||||
|
|
@ -221,17 +66,18 @@ public abstract class TraversalEngine {
|
|||
|
||||
/**
|
||||
* @param curTime (current runtime, in millisecs)
|
||||
*
|
||||
* @return true if the maximum interval (in millisecs) has passed since the last printing
|
||||
*/
|
||||
protected boolean maxElapsedIntervalForPrinting(final long curTime) {
|
||||
private boolean maxElapsedIntervalForPrinting(final long curTime) {
|
||||
return (curTime - this.lastProgressPrintTime) > MAX_PROGRESS_PRINT_TIME;
|
||||
}
|
||||
|
||||
/**
|
||||
* Forward request to printProgress
|
||||
*
|
||||
* @param type
|
||||
* @param loc
|
||||
* @param type the type of traversal
|
||||
* @param loc the location
|
||||
*/
|
||||
public void printProgress(final String type, GenomeLoc loc) {
|
||||
printProgress(false, type, loc);
|
||||
|
|
@ -245,7 +91,7 @@ public abstract class TraversalEngine {
|
|||
* @param type String to print out describing our atomic traversal type ("read", "locus", etc)
|
||||
* @param loc Current location
|
||||
*/
|
||||
public void printProgress(boolean mustPrint, final String type, GenomeLoc loc) {
|
||||
private void printProgress(boolean mustPrint, final String type, GenomeLoc loc) {
|
||||
final long nRecords = TraversalStatistics.nRecords;
|
||||
final long curTime = System.currentTimeMillis();
|
||||
final double elapsed = (curTime - startTime) / 1000.0;
|
||||
|
|
@ -259,22 +105,16 @@ public abstract class TraversalEngine {
|
|||
else
|
||||
logger.info(String.format("[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)", nRecords, type, elapsed, secsPer1MReads, type));
|
||||
|
||||
// Currently samReadingTracker will print misleading info if we're not processing the whole file
|
||||
|
||||
// If an index is enabled, file read progress is meaningless because a linear
|
||||
// traversal is not being performed. For now, don't bother printing progress.
|
||||
// TODO: Create a sam indexed read tracker that tracks based on percentage through the query.
|
||||
if (samReadingTracker != null && this.locs == null)
|
||||
logger.info(String.format("[PROGRESS] -> %s", samReadingTracker.progressMeter()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A passthrough method so that subclasses can report which types of traversals they're using.
|
||||
*
|
||||
* @param sum Result of the computation.
|
||||
* @param <T> Type of the computation.
|
||||
*/
|
||||
public abstract <T> void printOnTraversalDone( T sum );
|
||||
public abstract <T> void printOnTraversalDone(T sum);
|
||||
|
||||
/**
|
||||
* Called after a traversal to print out information about the traversal process
|
||||
|
|
@ -290,9 +130,9 @@ public abstract class TraversalEngine {
|
|||
final double elapsed = (curTime - startTime) / 1000.0;
|
||||
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
|
||||
logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)",
|
||||
TraversalStatistics.nSkippedReads,
|
||||
TraversalStatistics.nReads,
|
||||
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
|
||||
TraversalStatistics.nSkippedReads,
|
||||
TraversalStatistics.nReads,
|
||||
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
|
||||
logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
|
||||
logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates));
|
||||
logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary));
|
||||
|
|
@ -306,147 +146,26 @@ public abstract class TraversalEngine {
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Initialize the traversal engine. After this point traversals can be run over the data
|
||||
*
|
||||
* @return true on success
|
||||
*/
|
||||
public boolean initialize() {
|
||||
/** Initialize the traversal engine. After this point traversals can be run over the data */
|
||||
public void initialize() {
|
||||
lastProgressPrintTime = startTime = System.currentTimeMillis();
|
||||
// Initial the reference ordered data iterators
|
||||
initializeRODs();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
protected SAMFileReader initializeSAMFile(File samFile) {
|
||||
// todo: fixme, this is a hack to try out dynamic merging
|
||||
if ( samFile.toString().endsWith(".list") ) {
|
||||
return null;
|
||||
// todo: omg, this is just scary, just it's just for testing purposes. fix with the new DataSource system
|
||||
} else {
|
||||
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
||||
samReader.setValidationStringency(strictness);
|
||||
|
||||
final SAMFileHeader header = samReader.getFileHeader();
|
||||
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
|
||||
|
||||
// Kludge filename into sam file header.
|
||||
if (samReader.getFileHeader().getReadGroups().size() < 1) {
|
||||
//logger.warn("Setting header in reader " + f.getName());
|
||||
SAMReadGroupRecord rec = new SAMReadGroupRecord(samFile.getName());
|
||||
rec.setLibrary(samFile.getName());
|
||||
rec.setSample(samFile.getName());
|
||||
|
||||
samReader.getFileHeader().addReadGroup(rec);
|
||||
}
|
||||
|
||||
return samReader;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare the list of reference ordered data iterators for each of the rods
|
||||
* this method must be implemented by all traversal engines
|
||||
*
|
||||
* @return A list of ROD iterators for getting data from each ROD
|
||||
*/
|
||||
protected void initializeRODs() {
|
||||
// set up reference ordered data
|
||||
rodIters = new ArrayList<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>>();
|
||||
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> data : rods) {
|
||||
rodIters.add(new Pair<String, RODIterator<? extends ReferenceOrderedDatum>>(data.getName(), data.iterator()));
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// shutdown
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public boolean shutdown() {
|
||||
// todo: actually shutdown the resources
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// dealing with reference ordered data
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Builds a list of the reference ordered datum at loc from each of the iterators. This function
|
||||
* assumes you are accessing the data in order. You can't use this function for random access. Each
|
||||
* successive call moves you along the file, consuming all data before loc.
|
||||
* @param walker the walker to run with
|
||||
* @param shard a shard of data
|
||||
* @param dataProvider the data provider that generates data given the shard
|
||||
* @param sum the accumulator
|
||||
* @param <M> an object of the map type
|
||||
* @param <T> an object of the reduce type
|
||||
*
|
||||
* @param loc The location to get the rods at
|
||||
* @return A list of ReferenceOrderDatum at loc. ROD without a datum at loc will be null in the list
|
||||
* @return an object of the reduce type
|
||||
*/
|
||||
protected RefMetaDataTracker getReferenceOrderedDataAtLocus(final GenomeLoc loc) {
|
||||
RefMetaDataTracker tracks = new RefMetaDataTracker();
|
||||
for (Pair<String, RODIterator<? extends ReferenceOrderedDatum>> pair : rodIters) {
|
||||
String name = pair.getFirst();
|
||||
tracks.bind(name, pair.getSecond().seekForward(loc));
|
||||
}
|
||||
|
||||
return tracks;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// processing
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public <M, T> T traverse(Walker<M, T> walker) {
|
||||
T sum = null;
|
||||
if ( hasLocations() && walker.isReduceByInterval() ) {
|
||||
logger.info("Doing reduce by interval processing");
|
||||
if ( ! hasLocations() )
|
||||
Utils.scareUser("ReduceByInterval requested by no interval was provided");
|
||||
List<Pair<GenomeLoc, T>> map = new ArrayList<Pair<GenomeLoc, T>>(locs.size());
|
||||
for ( GenomeLoc loc : locs ) {
|
||||
ArrayList<GenomeLoc> l = new ArrayList<GenomeLoc>();
|
||||
l.add(loc);
|
||||
T intervalSum = traverse(walker, l);
|
||||
sum = intervalSum;
|
||||
map.add(new Pair<GenomeLoc, T>(loc, intervalSum));
|
||||
}
|
||||
walker.onTraversalDone(map);
|
||||
} else {
|
||||
List<GenomeLoc> l = new ArrayList<GenomeLoc>();
|
||||
if ( hasLocations() )
|
||||
l = locs;
|
||||
sum = traverse(walker, l);
|
||||
}
|
||||
|
||||
printOnTraversalDone("elements", sum);
|
||||
return sum;
|
||||
}
|
||||
|
||||
public <M, T> T traverse(Walker<M, T> walker, List<GenomeLoc> locations) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public <M,T> T traverse( Walker<M,T> walker,
|
||||
Shard shard,
|
||||
ShardDataProvider dataProvider,
|
||||
T sum ) {
|
||||
throw new UnsupportedOperationException( "This method is a required override for new traversal engines. Please port your traversal engine to the new style." );
|
||||
}
|
||||
|
||||
public void verifySortOrder(final boolean requiresSortedOrder) {
|
||||
if (beSafeP && !SORT_ON_FLY && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
|
||||
final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing";
|
||||
if (requiresSortedOrder || strictness == SAMFileReader.ValidationStringency.STRICT)
|
||||
throw new RuntimeIOException(msg);
|
||||
else if (strictness == SAMFileReader.ValidationStringency.LENIENT)
|
||||
logger.warn(msg);
|
||||
}
|
||||
}
|
||||
@Deprecated
|
||||
public SAMFileReader getSamReader() { return this.samReader; }
|
||||
public abstract <M, T> T traverse(Walker<M, T> walker,
|
||||
Shard shard,
|
||||
ShardDataProvider dataProvider,
|
||||
T sum);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,15 +35,12 @@ import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
|||
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
|
|
@ -71,17 +68,6 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
|
||||
private final boolean DEBUG = false;
|
||||
|
||||
/**
|
||||
* Creates a new, uninitialized TraversalEngine
|
||||
*
|
||||
* @param reads SAM/BAM file of reads
|
||||
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
||||
* @param rods Array of reference ordered data sets
|
||||
*/
|
||||
public TraverseDuplicates(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
super(reads, ref, rods);
|
||||
}
|
||||
|
||||
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter) {
|
||||
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
|
||||
ArrayList<SAMRecord> l = new ArrayList<SAMRecord>();
|
||||
|
|
|
|||
|
|
@ -1,27 +1,18 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.AllLocusView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.CoveredLocusView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
||||
|
|
@ -33,10 +24,6 @@ public class TraverseLoci extends TraversalEngine {
|
|||
*/
|
||||
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
|
||||
public TraverseLoci(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
super( reads, ref, rods );
|
||||
}
|
||||
|
||||
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
|
||||
if ( locations.isEmpty() )
|
||||
Utils.scareUser("Requested all locations be processed without providing locations to be processed!");
|
||||
|
|
|
|||
|
|
@ -1,20 +1,20 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -25,10 +25,6 @@ import net.sf.samtools.SAMRecord;
|
|||
*/
|
||||
public class TraverseLocusWindows extends TraversalEngine {
|
||||
|
||||
public TraverseLocusWindows(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
super(reads, ref, rods);
|
||||
}
|
||||
|
||||
public <M,T> T traverse( Walker<M,T> walker,
|
||||
Shard shard,
|
||||
ShardDataProvider dataProvider,
|
||||
|
|
@ -70,7 +66,7 @@ public class TraverseLocusWindows extends TraversalEngine {
|
|||
return sum;
|
||||
}
|
||||
|
||||
private LocusContext getLocusContext(Iterator<SAMRecord> readIter, GenomeLoc interval) {
|
||||
private LocusContext getLocusContext(StingSAMIterator readIter, GenomeLoc interval) {
|
||||
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
boolean done = false;
|
||||
long leftmostIndex = interval.getStart(),
|
||||
|
|
@ -92,8 +88,8 @@ public class TraverseLocusWindows extends TraversalEngine {
|
|||
|
||||
GenomeLoc window = GenomeLocParser.createGenomeLoc(interval.getContig(), leftmostIndex, rightmostIndex);
|
||||
LocusContext locus = new LocusContext(window, reads, null);
|
||||
if ( DOWNSAMPLE_BY_COVERAGE )
|
||||
locus.downsampleToCoverage(downsamplingCoverage);
|
||||
if ( readIter.getSourceInfo().getDownsampleToCoverage() != null )
|
||||
locus.downsampleToCoverage(readIter.getSourceInfo().getDownsampleToCoverage());
|
||||
|
||||
return locus;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,24 +4,20 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.IntervalShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.IntervalShard;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -63,22 +59,9 @@ public class TraverseReads extends TraversalEngine {
|
|||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(TraverseReads.class);
|
||||
|
||||
|
||||
/**
|
||||
* Creates a new, uninitialized TraversalEngine
|
||||
*
|
||||
* @param reads SAM/BAM file of reads
|
||||
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
||||
* @param rods Array of reference ordered data sets
|
||||
*/
|
||||
public TraverseReads(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
super(reads, ref, rods);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
*
|
||||
*
|
||||
* @param walker the walker to traverse with
|
||||
* @param shard the shard, specifying the range of data to iterate over
|
||||
* @param dataProvider the provider of the reads data
|
||||
|
|
@ -107,7 +90,7 @@ public class TraverseReads extends TraversalEngine {
|
|||
|
||||
ReadWalker<M, T> readWalker = (ReadWalker<M, T>) walker;
|
||||
boolean needsReferenceBasesP = WalkerManager.isRequired(walker, DataSource.REFERENCE_BASES);
|
||||
|
||||
|
||||
ReadView reads = new ReadView(dataProvider);
|
||||
ReadReferenceView reference = new ReadReferenceView(dataProvider);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,249 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.playground.indels.*;
|
||||
import net.sf.samtools.*;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
|
||||
// Investigates indels called in the alignment data
|
||||
|
||||
@WalkerName("InspectIndels")
|
||||
public class IndelInspector extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName="IndelVerbosity", shortName="iverb", required=false, doc="Verbosity level: SILENT, PILESUMMARY, ALIGNMENTS") public String VERBOSITY_LEVEL = "SILENT";
|
||||
@Argument(fullName="OutputNormal", shortName="outNorm", required=true, doc="Output file (sam or bam) for non-indel related reads and indel reads that were not improved") public String OUT1;
|
||||
@Argument(fullName="OutputIndel", shortName="outIndel", required=true, doc="Output file (sam or bam) for improved (realigned) indel related reads") public String OUT2;
|
||||
@Argument(fullName="ControlRun", shortName="paranoid", required=false, doc="If true, all reads that would be otherwise picked and processed by this tool will be saved, unmodified, into the OutputNormal file") public boolean CONTROL_RUN = false;
|
||||
@Argument(fullName="ErrorCheckMode", shortName="error", required=false, doc="Error counting mode: MM (mismatches only (from sam tags)), MC (mismatches only doing actual mismatch count on the fly (use this if tags are incorrectly set)), ERR (errors (arachne style: mm+gap lengths)), MG (count mismatches and gaps as one error each)") public String ERR_MODE;
|
||||
@Argument(fullName="MaxErrors", shortName="maxError", required=false, doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS = 10000;
|
||||
@Argument(fullName="MaxReadLength", shortName="maxRead", required=false, doc="Ignore reads that are longer than the specified cutoff (not a good way to do things but might be necessary because of performance issues)") public int MAX_READ_LENGTH = -1;
|
||||
|
||||
private int discarded_cigar_count = 0;
|
||||
private int discarded_long_read_count = 0;
|
||||
private int discarded_maxerr_count = 0;
|
||||
private int unmapped_read_count = 0;
|
||||
private int total_reads = 0;
|
||||
private IndelRecordPileCollector collector;
|
||||
private PileBuilder pileBuilder = null;
|
||||
private PassThroughWriter ptWriter;
|
||||
private String cur_contig = null;
|
||||
|
||||
|
||||
// public boolean filter(LocusContext context, SAMRecord read) {
|
||||
// // we only want aligned reads
|
||||
// return !read.getReadUnmappedFlag();
|
||||
// }
|
||||
|
||||
public void initialize() {
|
||||
|
||||
if ( ERR_MODE != null && ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") && ! ERR_MODE.equals("MC") ) {
|
||||
err.println("Unknown value specified for ERR_MODE: "+ERR_MODE+ "... skipping it.");
|
||||
ERR_MODE = null;
|
||||
}
|
||||
|
||||
SAMFileHeader header = getToolkit().getSamReader().getFileHeader();
|
||||
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
ptWriter = new PassThroughWriter(OUT1, header);
|
||||
if ( ! CONTROL_RUN ) pileBuilder = new PileBuilder(OUT2, header, ptWriter);
|
||||
|
||||
try {
|
||||
if ( CONTROL_RUN ) collector = new IndelRecordPileCollector(ptWriter, new DiscardingPileReceiver() );
|
||||
else collector = new IndelRecordPileCollector(ptWriter, pileBuilder );
|
||||
} catch(Exception e) {
|
||||
err.println(e.getMessage());
|
||||
}
|
||||
if ( collector == null )
|
||||
return;
|
||||
|
||||
collector.setControlRun(CONTROL_RUN);
|
||||
|
||||
if ( ! CONTROL_RUN ) {
|
||||
if ( VERBOSITY_LEVEL.toUpperCase().equals("SILENT")) pileBuilder.setVerbosity(PileBuilder.SILENT);
|
||||
else if ( VERBOSITY_LEVEL.toUpperCase().equals("PILESUMMARY") ) pileBuilder.setVerbosity(PileBuilder.PILESUMMARY);
|
||||
else if ( VERBOSITY_LEVEL.toUpperCase().equals("ALIGNMENTS") ) pileBuilder.setVerbosity(PileBuilder.ALIGNMENTS);
|
||||
else {
|
||||
err.println("Unrecognized VERBOSITY_LEVEL setting... skipping it.");
|
||||
pileBuilder.setVerbosity(PileBuilder.SILENT);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map(char[] ref, SAMRecord read) {
|
||||
|
||||
total_reads++;
|
||||
|
||||
if ( read.getReadUnmappedFlag() ) {
|
||||
unmapped_read_count++;
|
||||
ptWriter.receive(read); // make sure we still write the read to the output, we do not want to lose data!
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* 11 May 2009 - Commented out because we no longer have the full reference sequence for sanity checking.
|
||||
ReferenceSequence contig_seq = context.getReferenceContig();
|
||||
if ( read.getReferenceName() != cur_contig) {
|
||||
cur_contig = read.getReferenceName();
|
||||
out.println("Contig "+cur_contig);
|
||||
String refstr = new String(contig_seq.getBases());
|
||||
if ( !CONTROL_RUN )
|
||||
pileBuilder.setReferenceSequence(refstr);
|
||||
out.println("loaded contig "+cur_contig+" (index="+read.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
|
||||
}
|
||||
*/
|
||||
|
||||
// if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY") > 0 ) continue; // skip chrM and unplaced contigs for now
|
||||
|
||||
if ( MAX_READ_LENGTH > 0 && read.getReadLength() > MAX_READ_LENGTH ) {
|
||||
ptWriter.receive(read);
|
||||
discarded_long_read_count++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// we currently do not know how to deal with cigars containing elements other than M,I,D, so
|
||||
// let's just skip the reads that contain those other elements (clipped reads?)
|
||||
Cigar c = read.getCigar();
|
||||
boolean cigar_acceptable = true;
|
||||
for ( int z = 0 ; z < c.numCigarElements() ; z++ ) {
|
||||
CigarElement ce = c.getCigarElement(z);
|
||||
switch ( ce.getOperator() ) {
|
||||
case M:
|
||||
case I:
|
||||
case D: break;
|
||||
default:
|
||||
cigar_acceptable = false;
|
||||
}
|
||||
}
|
||||
if ( ! cigar_acceptable ) {
|
||||
ptWriter.receive(read);
|
||||
discarded_cigar_count++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int err = -1;
|
||||
/*
|
||||
System.out.println("MM: "+numMismatches(r));
|
||||
System.out.println("direct: "+numMismatchesDirect(r,contig_seq));
|
||||
System.out.print(" ");
|
||||
for ( int i = read.getAlignmentStart() - 1 ; i < read.getAlignmentEnd() ; i++ ) System.out.print((char)contig_seq.getBases()[i]);
|
||||
System.out.println();
|
||||
System.out.println((read.getReadNegativeStrandFlag()?"<-":"->")+read.getReadString());
|
||||
System.out.println("cigar: "+read.getCigarString());
|
||||
System.out.println();
|
||||
if (counter++ == 20 ) break;
|
||||
continue;
|
||||
*/
|
||||
|
||||
if ( ERR_MODE != null ) {
|
||||
if ( ERR_MODE.equals("MM")) err = numMismatches(read,ref);
|
||||
else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(read,ref);
|
||||
else if ( ERR_MODE.equals("ERR")) err = numErrors(read,ref);
|
||||
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(read,ref);
|
||||
if ( err > MAX_ERRS ) {
|
||||
ptWriter.receive(read);
|
||||
discarded_maxerr_count++;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
collector.receive(read);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD
|
||||
*
|
||||
* @param r SAM record that must specify an alignment
|
||||
* @return number of errors (number of mismatches plus total length of all insertions/deletions
|
||||
* @throws RuntimeException
|
||||
*/
|
||||
private static int numErrors(SAMRecord r, char[] ref) throws RuntimeException {
|
||||
|
||||
// NM currently stores the total number of mismatches in all blocks + 1
|
||||
int errs = numMismatches(r,ref);
|
||||
|
||||
// now we have to add the total length of all indels:
|
||||
Cigar c = r.getCigar();
|
||||
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
switch( ce.getOperator()) {
|
||||
case M : break; // we already have correct number of mismatches
|
||||
case I :
|
||||
case D :
|
||||
errs += ce.getLength();
|
||||
break;
|
||||
default: throw new RuntimeException("Unrecognized cigar element");
|
||||
}
|
||||
}
|
||||
return errs;
|
||||
}
|
||||
|
||||
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD
|
||||
*
|
||||
* @param r SAM record that must specify an alignment
|
||||
* @return number of errors (number of mismatches plus total number of all insertions/deletions (each insertion or
|
||||
* deletion will be counted as a single error regardless of the length)
|
||||
* @throws RuntimeException
|
||||
*/
|
||||
private static int numMismatchesGaps(SAMRecord r,char[] ref) throws RuntimeException {
|
||||
|
||||
// NM currently stores the total number of mismatches in all blocks + 1
|
||||
int errs = numMismatches(r,ref);
|
||||
|
||||
// now we have to add the total length of all indels:
|
||||
Cigar c = r.getCigar();
|
||||
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
switch( ce.getOperator()) {
|
||||
case M : break; // we already have correct number of mismatches
|
||||
case I :
|
||||
case D :
|
||||
errs++;
|
||||
break;
|
||||
default: throw new RuntimeException("Unrecognized cigar element");
|
||||
}
|
||||
}
|
||||
return errs;
|
||||
}
|
||||
|
||||
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
|
||||
private static int numMismatches(SAMRecord r, char[] refseq) throws RuntimeException {
|
||||
|
||||
// NM currently stores the total number of mismatches in all blocks + 1
|
||||
Integer i = (Integer)r.getAttribute("NM");
|
||||
if ( i == null ) return AlignmentUtils.numMismatches(r,refseq);
|
||||
return ((Integer)r.getAttribute("NM")).intValue() - 1;
|
||||
|
||||
}
|
||||
|
||||
// Given result of map function
|
||||
public Integer reduceInit() { return 0; }
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
collector.close(); // write the reads collector might be still holding
|
||||
|
||||
if ( ! CONTROL_RUN ) {
|
||||
pileBuilder.printStats();
|
||||
pileBuilder.close();
|
||||
}
|
||||
out.println("done.");
|
||||
out.println("Total reads processed: "+ total_reads);
|
||||
out.println("Unmodified: "+ ptWriter.getNumReadsReceived());
|
||||
if ( pileBuilder != null) out.println("Written into realigned piles: "+ pileBuilder.getNumReadsWritten());
|
||||
if ( pileBuilder != null) out.println("Received for realignment: "+ pileBuilder.getNumReadsReceived());
|
||||
out.println("Discarded reads with non-M,I,D cigar elements: "+ discarded_cigar_count);
|
||||
out.println("Discarded long reads (above "+MAX_READ_LENGTH+" bp): "+ discarded_long_read_count);
|
||||
out.println("Discarded reads with error counts exceeding the threshold: "+ discarded_maxerr_count);
|
||||
out.println("Unmapped reads (passed through to the output): "+ unmapped_read_count);
|
||||
out.println();
|
||||
collector.printLengthHistograms();
|
||||
ptWriter.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -56,7 +56,6 @@ public class ArtificialReadsTraversal extends TraversalEngine {
|
|||
|
||||
/** Creates a new, uninitialized ArtificialReadsTraversal */
|
||||
public ArtificialReadsTraversal() {
|
||||
super(null, null, null);
|
||||
}
|
||||
|
||||
// what read ordering are we using
|
||||
|
|
|
|||
|
|
@ -7,8 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
|||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.CountReadsWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -98,9 +96,8 @@ public class TraverseReadsTest extends BaseTest {
|
|||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
fail("Couldn't set the walkers printstream");
|
||||
}
|
||||
List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>>();
|
||||
|
||||
traversalEngine = new TraverseReads(bamList, refFile, rods);
|
||||
|
||||
traversalEngine = new TraverseReads();
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue