Refactoring of the traversal engine base class, I removed a lot of old code.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1209 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3519323156
commit
d86717db93
|
|
@ -27,20 +27,18 @@ package org.broadinstitute.sting.gatk;
|
||||||
|
|
||||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||||
import net.sf.samtools.SAMFileReader;
|
|
||||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.traversals.*;
|
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.io.*;
|
|
||||||
|
|
||||||
public class GenomeAnalysisEngine {
|
public class GenomeAnalysisEngine {
|
||||||
|
|
||||||
|
|
@ -100,7 +98,7 @@ public class GenomeAnalysisEngine {
|
||||||
if (argCollection.HAPMAPChipFile != null)
|
if (argCollection.HAPMAPChipFile != null)
|
||||||
bindConvenienceRods("hapmap-chip", "GFF", argCollection.HAPMAPChipFile);
|
bindConvenienceRods("hapmap-chip", "GFF", argCollection.HAPMAPChipFile);
|
||||||
// TODO: The ROD iterator currently does not understand multiple intervals file. Fix this by cleaning the ROD system.
|
// TODO: The ROD iterator currently does not understand multiple intervals file. Fix this by cleaning the ROD system.
|
||||||
if ( argCollection.intervals != null && argCollection.intervals.size() == 1) {
|
if (argCollection.intervals != null && argCollection.intervals.size() == 1) {
|
||||||
bindConvenienceRods("interval", "Intervals", argCollection.intervals.get(0).replaceAll(",", ""));
|
bindConvenienceRods("interval", "Intervals", argCollection.intervals.get(0).replaceAll(",", ""));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -111,7 +109,7 @@ public class GenomeAnalysisEngine {
|
||||||
validateInputsAgainstWalker(my_walker, argCollection, rods);
|
validateInputsAgainstWalker(my_walker, argCollection, rods);
|
||||||
|
|
||||||
// create the output streams
|
// create the output streams
|
||||||
initializeOutputStreams( my_walker );
|
initializeOutputStreams(my_walker);
|
||||||
|
|
||||||
// our microscheduler, which is in charge of running everything
|
// our microscheduler, which is in charge of running everything
|
||||||
MicroScheduler microScheduler = null;
|
MicroScheduler microScheduler = null;
|
||||||
|
|
@ -129,13 +127,10 @@ public class GenomeAnalysisEngine {
|
||||||
// perform validation steps that are common to all the engines
|
// perform validation steps that are common to all the engines
|
||||||
genericEngineSetup();
|
genericEngineSetup();
|
||||||
|
|
||||||
// parse out any genomic location they've provided
|
|
||||||
//List<GenomeLoc> locationsList = setupIntervalRegion();
|
|
||||||
List<GenomeLoc> locationsList = engine.getLocations();
|
|
||||||
GenomeLocSortedSet locs = null;
|
GenomeLocSortedSet locs = null;
|
||||||
if (locationsList != null)
|
if (argCollection.intervals != null) {
|
||||||
locs = GenomeLocSortedSet.createSetFromList(locationsList);
|
locs = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.intervals));
|
||||||
|
}
|
||||||
// excute the microscheduler, storing the results
|
// excute the microscheduler, storing the results
|
||||||
walkerReturn = microScheduler.execute(my_walker, locs, argCollection.maximumEngineIterations);
|
walkerReturn = microScheduler.execute(my_walker, locs, argCollection.maximumEngineIterations);
|
||||||
}
|
}
|
||||||
|
|
@ -145,6 +140,7 @@ public class GenomeAnalysisEngine {
|
||||||
*
|
*
|
||||||
* @param my_walker our walker of type LocusWalker
|
* @param my_walker our walker of type LocusWalker
|
||||||
* @param rods the reference order data
|
* @param rods the reference order data
|
||||||
|
*
|
||||||
* @return a new microscheduler
|
* @return a new microscheduler
|
||||||
*/
|
*/
|
||||||
private MicroScheduler createMicroscheduler(Walker my_walker, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
private MicroScheduler createMicroscheduler(Walker my_walker, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||||
|
|
@ -156,38 +152,23 @@ public class GenomeAnalysisEngine {
|
||||||
// create the MicroScheduler
|
// create the MicroScheduler
|
||||||
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
|
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
|
||||||
engine = microScheduler.getTraversalEngine();
|
engine = microScheduler.getTraversalEngine();
|
||||||
}
|
} else if (my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
|
||||||
else if (my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) {
|
|
||||||
if (argCollection.referenceFile == null)
|
if (argCollection.referenceFile == null)
|
||||||
Utils.scareUser(String.format("Read-based traversals require a reference file but none was given"));
|
Utils.scareUser(String.format("Read-based traversals require a reference file but none was given"));
|
||||||
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
|
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
|
||||||
engine = microScheduler.getTraversalEngine();
|
engine = microScheduler.getTraversalEngine();
|
||||||
} else {
|
} else {
|
||||||
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
|
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
|
||||||
}
|
}
|
||||||
|
|
||||||
return microScheduler;
|
return microScheduler;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/** commands that get executed for each engine, regardless of the type */
|
||||||
* commands that get executed for each engine, regardless of the type
|
|
||||||
*/
|
|
||||||
private void genericEngineSetup() {
|
private void genericEngineSetup() {
|
||||||
Reads sourceInfo = extractSourceInfoFromArguments(argCollection);
|
Reads sourceInfo = extractSourceInfoFromArguments(argCollection);
|
||||||
|
|
||||||
engine.setStrictness(sourceInfo.getValidationStringency());
|
|
||||||
|
|
||||||
engine.setMaxReads(argCollection.maximumEngineIterations);
|
engine.setMaxReads(argCollection.maximumEngineIterations);
|
||||||
engine.setFilterZeroMappingQualityReads(argCollection.filterZeroMappingQualityReads);
|
|
||||||
|
|
||||||
// we default interval files over the genome region string
|
|
||||||
if (argCollection.intervals != null) {
|
|
||||||
engine.setLocation(parseIntervalRegion(argCollection.intervals));
|
|
||||||
}
|
|
||||||
|
|
||||||
engine.setReadFilters(sourceInfo);
|
|
||||||
|
|
||||||
engine.initialize();
|
engine.initialize();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -196,9 +177,9 @@ public class GenomeAnalysisEngine {
|
||||||
*
|
*
|
||||||
* @return a list of genomeLoc representing the interval file
|
* @return a list of genomeLoc representing the interval file
|
||||||
*/
|
*/
|
||||||
public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals ) {
|
public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals) {
|
||||||
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
||||||
for( String interval: intervals ) {
|
for (String interval : intervals) {
|
||||||
if (new File(interval).exists()) {
|
if (new File(interval).exists()) {
|
||||||
locs.addAll(GenomeLocParser.intervalFileToList(interval));
|
locs.addAll(GenomeLocParser.intervalFileToList(interval));
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -211,9 +192,12 @@ public class GenomeAnalysisEngine {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Bundles all the source information about the reads into a unified data structure.
|
* Bundles all the source information about the reads into a unified data structure.
|
||||||
|
*
|
||||||
* @param argCollection The collection of arguments passed to the engine.
|
* @param argCollection The collection of arguments passed to the engine.
|
||||||
|
*
|
||||||
* @return The reads object providing reads source info.
|
* @return The reads object providing reads source info.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
private Reads extractSourceInfoFromArguments( GATKArgumentCollection argCollection ) {
|
private Reads extractSourceInfoFromArguments( GATKArgumentCollection argCollection ) {
|
||||||
return new Reads( argCollection.samFiles,
|
return new Reads( argCollection.samFiles,
|
||||||
argCollection.strictnessLevel,
|
argCollection.strictnessLevel,
|
||||||
|
|
@ -229,33 +213,33 @@ public class GenomeAnalysisEngine {
|
||||||
String walkerName = WalkerManager.getWalkerName(walker.getClass());
|
String walkerName = WalkerManager.getWalkerName(walker.getClass());
|
||||||
|
|
||||||
// Check what the walker says is required against what was provided on the command line.
|
// Check what the walker says is required against what was provided on the command line.
|
||||||
if( WalkerManager.isRequired(walker,DataSource.READS) && (arguments.samFiles == null || arguments.samFiles.size() == 0) )
|
if (WalkerManager.isRequired(walker, DataSource.READS) && (arguments.samFiles == null || arguments.samFiles.size() == 0))
|
||||||
throw new ArgumentException(String.format("Walker %s requires reads but none were provided. If this is incorrect, alter the walker's @Requires annotation.",walkerName));
|
throw new ArgumentException(String.format("Walker %s requires reads but none were provided. If this is incorrect, alter the walker's @Requires annotation.", walkerName));
|
||||||
if( WalkerManager.isRequired(walker,DataSource.REFERENCE) && arguments.referenceFile == null )
|
if (WalkerManager.isRequired(walker, DataSource.REFERENCE) && arguments.referenceFile == null)
|
||||||
throw new ArgumentException(String.format("Walker %s requires a reference but none was provided. If this is incorrect, alter the walker's @Requires annotation.",walkerName));
|
throw new ArgumentException(String.format("Walker %s requires a reference but none was provided. If this is incorrect, alter the walker's @Requires annotation.", walkerName));
|
||||||
|
|
||||||
// Check what the walker says is allowed against what was provided on the command line.
|
// Check what the walker says is allowed against what was provided on the command line.
|
||||||
if( (arguments.samFiles != null && arguments.samFiles.size() > 0) && !WalkerManager.isAllowed(walker,DataSource.READS) )
|
if ((arguments.samFiles != null && arguments.samFiles.size() > 0) && !WalkerManager.isAllowed(walker, DataSource.READS))
|
||||||
throw new ArgumentException(String.format("Walker %s does not allow reads but reads were provided. If this is incorrect, alter the walker's @Allows annotation",walkerName));
|
throw new ArgumentException(String.format("Walker %s does not allow reads but reads were provided. If this is incorrect, alter the walker's @Allows annotation", walkerName));
|
||||||
if( arguments.referenceFile != null && !WalkerManager.isAllowed(walker,DataSource.REFERENCE) )
|
if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE))
|
||||||
throw new ArgumentException(String.format("Walker %s does not allow a reference but one was provided. If this is incorrect, alter the walker's @Allows annotation",walkerName));
|
throw new ArgumentException(String.format("Walker %s does not allow a reference but one was provided. If this is incorrect, alter the walker's @Allows annotation", walkerName));
|
||||||
|
|
||||||
// Check to make sure that all required metadata is present.
|
// Check to make sure that all required metadata is present.
|
||||||
List<RMD> allRequired = WalkerManager.getRequiredMetaData(walker);
|
List<RMD> allRequired = WalkerManager.getRequiredMetaData(walker);
|
||||||
for( RMD required: allRequired ) {
|
for (RMD required : allRequired) {
|
||||||
boolean found = false;
|
boolean found = false;
|
||||||
for( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod: rods ) {
|
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> rod : rods) {
|
||||||
if( rod.matches(required.name(),required.type()) )
|
if (rod.matches(required.name(), required.type()))
|
||||||
found = true;
|
found = true;
|
||||||
}
|
}
|
||||||
if( !found )
|
if (!found)
|
||||||
throw new ArgumentException(String.format("Unable to find reference metadata (%s,%s)",required.name(),required.type()));
|
throw new ArgumentException(String.format("Unable to find reference metadata (%s,%s)", required.name(), required.type()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check to see that no forbidden rods are present.
|
// Check to see that no forbidden rods are present.
|
||||||
for( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod: rods ) {
|
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> rod : rods) {
|
||||||
if( !WalkerManager.isAllowed(walker,rod) )
|
if (!WalkerManager.isAllowed(walker, rod))
|
||||||
throw new ArgumentException(String.format("Walker does not allow access to metadata: %s. If this is correct, change the @Allows metadata",rod.getName()));
|
throw new ArgumentException(String.format("Walker does not allow access to metadata: %s. If this is correct, change the @Allows metadata", rod.getName()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -266,7 +250,7 @@ public class GenomeAnalysisEngine {
|
||||||
*/
|
*/
|
||||||
public int getBAMCompression() {
|
public int getBAMCompression() {
|
||||||
return (argCollection.BAMcompression == null ||
|
return (argCollection.BAMcompression == null ||
|
||||||
argCollection.BAMcompression < 1 ||
|
argCollection.BAMcompression < 1 ||
|
||||||
argCollection.BAMcompression > 8) ? 5 : argCollection.BAMcompression;
|
argCollection.BAMcompression > 8) ? 5 : argCollection.BAMcompression;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -284,7 +268,7 @@ public class GenomeAnalysisEngine {
|
||||||
|
|
||||||
|
|
||||||
/** Initialize the output streams as specified by the user. */
|
/** Initialize the output streams as specified by the user. */
|
||||||
private void initializeOutputStreams( Walker walker ) {
|
private void initializeOutputStreams(Walker walker) {
|
||||||
outputTracker = (argCollection.outErrFileName != null) ? new OutputTracker(argCollection.outErrFileName, argCollection.outErrFileName)
|
outputTracker = (argCollection.outErrFileName != null) ? new OutputTracker(argCollection.outErrFileName, argCollection.outErrFileName)
|
||||||
: new OutputTracker(argCollection.outFileName, argCollection.errFileName);
|
: new OutputTracker(argCollection.outFileName, argCollection.errFileName);
|
||||||
walker.initializeOutputStreams(outputTracker);
|
walker.initializeOutputStreams(outputTracker);
|
||||||
|
|
@ -299,17 +283,6 @@ public class GenomeAnalysisEngine {
|
||||||
return outputTracker;
|
return outputTracker;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* This function is deprecated in the new traversal engines, if you need to get the header
|
|
||||||
* please get the engine and then use the getHeader function
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public SAMFileReader getSamReader() {
|
|
||||||
return this.engine.getSamReader();
|
|
||||||
}
|
|
||||||
|
|
||||||
public TraversalEngine getEngine() {
|
public TraversalEngine getEngine() {
|
||||||
return this.engine;
|
return this.engine;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -101,13 +101,13 @@ public abstract class MicroScheduler {
|
||||||
*/
|
*/
|
||||||
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||||
if (walker instanceof ReadWalker) {
|
if (walker instanceof ReadWalker) {
|
||||||
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
|
traversalEngine = new TraverseReads();
|
||||||
} else if (walker instanceof LocusWalker) {
|
} else if (walker instanceof LocusWalker) {
|
||||||
traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
|
traversalEngine = new TraverseLoci();
|
||||||
} else if (walker instanceof LocusWindowWalker) {
|
} else if (walker instanceof LocusWindowWalker) {
|
||||||
traversalEngine = new TraverseLocusWindows(reads.getReadsFiles(), refFile, rods);
|
traversalEngine = new TraverseLocusWindows();
|
||||||
} else if (walker instanceof DuplicateWalker) {
|
} else if (walker instanceof DuplicateWalker) {
|
||||||
traversalEngine = new TraverseDuplicates(reads.getReadsFiles(), refFile, rods);
|
traversalEngine = new TraverseDuplicates();
|
||||||
} else {
|
} else {
|
||||||
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
|
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,19 +1,6 @@
|
||||||
package org.broadinstitute.sting.gatk.refdata;
|
package org.broadinstitute.sting.gatk.refdata;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.List;
|
||||||
import java.util.regex.MatchResult;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileNotFoundException;
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
|
||||||
import org.broadinstitute.sting.utils.xReadLines;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom
|
* loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom
|
||||||
|
|
|
||||||
|
|
@ -1,218 +1,63 @@
|
||||||
package org.broadinstitute.sting.gatk.traversals;
|
package org.broadinstitute.sting.gatk.traversals;
|
||||||
|
|
||||||
import net.sf.picard.filter.SamRecordFilter;
|
|
||||||
import net.sf.picard.filter.FilteringIterator;
|
|
||||||
import net.sf.picard.reference.ReferenceSequence;
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMFileReader;
|
|
||||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
|
||||||
import net.sf.samtools.SAMRecord;
|
|
||||||
import net.sf.samtools.SAMReadGroupRecord;
|
|
||||||
import net.sf.samtools.util.RuntimeIOException;
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.iterators.*;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.*;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import java.io.File;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
public abstract class TraversalEngine {
|
public abstract class TraversalEngine {
|
||||||
// list of reference ordered data objects
|
|
||||||
protected List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = null;
|
|
||||||
|
|
||||||
// Iterator over rods
|
|
||||||
List<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>> rodIters;
|
|
||||||
|
|
||||||
// How strict should we be with SAM/BAM parsing?
|
|
||||||
protected ValidationStringency strictness = ValidationStringency.STRICT;
|
|
||||||
|
|
||||||
// Time in milliseconds since we initialized this engine
|
// Time in milliseconds since we initialized this engine
|
||||||
protected long startTime = -1;
|
private long startTime = -1;
|
||||||
protected long lastProgressPrintTime = -1; // When was the last time we printed our progress?
|
private long lastProgressPrintTime = -1; // When was the last time we printed our progress?
|
||||||
|
|
||||||
// How long can we go without printing some progress info?
|
// How long can we go without printing some progress info?
|
||||||
protected long MAX_PROGRESS_PRINT_TIME = 30 * 1000; // 10 seconds in millisecs
|
private final long MAX_PROGRESS_PRINT_TIME = 30 * 1000; // 10 seconds in millisecs
|
||||||
protected long N_RECORDS_TO_PRINT = 1000000;
|
private final long N_RECORDS_TO_PRINT = 1000000;
|
||||||
|
|
||||||
// Maximum number of reads to process before finishing
|
// Maximum number of reads to process before finishing
|
||||||
protected long maxReads = -1;
|
protected long maxReads = -1;
|
||||||
|
|
||||||
// Name of the reads file, in BAM/SAM format
|
|
||||||
protected List<File> readsFiles = null; // the name of the reads file
|
|
||||||
protected SAMFileReader samReader = null;
|
|
||||||
// iterator over the sam records in the readsFile
|
|
||||||
protected Iterator<SAMRecord> samReadIter = null;
|
|
||||||
|
|
||||||
// The verifying iterator, it does checking
|
|
||||||
protected VerifyingSamIterator verifyingSamReadIter = null;
|
|
||||||
|
|
||||||
// The reference data -- filename, refSeqFile, and iterator
|
|
||||||
protected File refFileName = null; // the name of the reference file
|
|
||||||
|
|
||||||
// Progress tracker for the sam file
|
|
||||||
protected FileProgressTracker samReadingTracker = null;
|
|
||||||
|
|
||||||
protected boolean DEBUGGING = false;
|
|
||||||
protected boolean beSafeP = true;
|
|
||||||
protected boolean SORT_ON_FLY = false;
|
|
||||||
protected boolean DOWNSAMPLE_BY_FRACTION = false;
|
|
||||||
protected boolean DOWNSAMPLE_BY_COVERAGE = false;
|
|
||||||
protected boolean FILTER_UNSORTED_READS = false;
|
|
||||||
protected boolean walkOverAllSites = false;
|
|
||||||
protected int maxOnFlySorts = 100000;
|
|
||||||
protected double downsamplingFraction = 1.0;
|
|
||||||
protected int downsamplingCoverage = 0;
|
|
||||||
protected boolean filterZeroMappingQualityReads = false;
|
|
||||||
|
|
||||||
|
|
||||||
// the stored header
|
// the stored header
|
||||||
protected SAMFileHeader myHeader = null;
|
private SAMFileHeader myHeader = null;
|
||||||
|
|
||||||
/**
|
/** our log, which we want to capture anything from this class */
|
||||||
* our log, which we want to capture anything from this class
|
|
||||||
*/
|
|
||||||
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||||
|
|
||||||
|
|
||||||
// Locations we are going to process during the traversal
|
|
||||||
private List<GenomeLoc> locs = null;
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// Setting up the engine
|
|
||||||
//
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new, uninitialized TraversalEngine
|
|
||||||
*
|
|
||||||
* @param reads SAM/BAM file of reads
|
|
||||||
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
|
||||||
* @param rods Array of reference ordered data sets
|
|
||||||
*/
|
|
||||||
public TraversalEngine(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
|
||||||
readsFiles = reads;
|
|
||||||
refFileName = ref;
|
|
||||||
this.rods = rods;
|
|
||||||
}
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// Manipulating the underlying engine parameters
|
// Manipulating the underlying engine parameters
|
||||||
//
|
//
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//public void setRegion(final String reg) { regionStr = regionStr; }
|
|
||||||
//public void setTraversalType(final String type) { traversalType = type; }
|
|
||||||
public void setStrictness(final ValidationStringency s) {
|
|
||||||
strictness = s;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setMaxReads(final int maxReads) {
|
public void setMaxReads(final int maxReads) {
|
||||||
this.maxReads = maxReads;
|
this.maxReads = maxReads;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setFilterZeroMappingQualityReads(final boolean filterZeroMappingQualityReads) {
|
|
||||||
this.filterZeroMappingQualityReads = filterZeroMappingQualityReads;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setDebugging(final boolean d) {
|
|
||||||
DEBUGGING = d;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setSafetyChecking(final boolean beSafeP) {
|
|
||||||
if (!beSafeP)
|
|
||||||
logger.warn("*** Turning off safety checking, I hope you know what you are doing. Errors will result in debugging assert failures and other inscrutable messages...");
|
|
||||||
this.beSafeP = beSafeP;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setFilterUnsortedReads(final boolean filterUnsorted) {
|
|
||||||
if (!filterUnsorted)
|
|
||||||
logger.warn("*** Turning on filtering of out of order reads, I *really* hope you know what you are doing, as you are removing data...");
|
|
||||||
this.FILTER_UNSORTED_READS = filterUnsorted;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setSortOnFly(final int maxReadsToSort) {
|
|
||||||
logger.info("Sorting read file on the fly: max reads allowed is " + maxReadsToSort);
|
|
||||||
SORT_ON_FLY = true;
|
|
||||||
maxOnFlySorts = maxReadsToSort;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setSortOnFly() { setSortOnFly(100000); }
|
|
||||||
|
|
||||||
private void setDownsampleByFraction(final double fraction) {
|
|
||||||
logger.info("Downsampling to approximately " + (fraction * 100.0) + "% of filtered reads");
|
|
||||||
DOWNSAMPLE_BY_FRACTION = true;
|
|
||||||
downsamplingFraction = fraction;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setDownsampleByCoverage(final int coverage) {
|
|
||||||
logger.info("Downsampling to coverage " + coverage);
|
|
||||||
DOWNSAMPLE_BY_COVERAGE = true;
|
|
||||||
downsamplingCoverage = coverage;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets up read filtering performed by the traversal engine.
|
|
||||||
* @param reads Read filters to instantiate as the traversal engine walks over the data.
|
|
||||||
* @deprecated Should only be used by old-style traversals.
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public void setReadFilters( Reads reads) {
|
|
||||||
if( reads.getDownsamplingFraction() != null ) setDownsampleByFraction(reads.getDownsamplingFraction());
|
|
||||||
if( reads.getDownsampleToCoverage() != null ) setDownsampleByCoverage(reads.getDownsampleToCoverage());
|
|
||||||
if( reads.getSafetyChecking() != null ) setSafetyChecking(reads.getSafetyChecking());
|
|
||||||
}
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// functions for dealing locations (areas of the genome we're traversing over)
|
// functions for dealing locations (areas of the genome we're traversing over)
|
||||||
//
|
//
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get the associated SAM header for our run
|
* get the associated SAM header for our run
|
||||||
|
*
|
||||||
* @return the header if it's stored, null if not
|
* @return the header if it's stored, null if not
|
||||||
*/
|
*/
|
||||||
public SAMFileHeader getSAMHeader() {
|
public SAMFileHeader getSAMHeader() {
|
||||||
if ( myHeader == null )
|
|
||||||
myHeader = samReader.getFileHeader();
|
|
||||||
return myHeader;
|
return myHeader;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set's the SAM header for this traversal, which should
|
* set's the SAM header for this traversal, which should
|
||||||
* be the merged header in the multiple BAM file case.
|
* be the merged header in the multiple BAM file case.
|
||||||
*
|
*
|
||||||
* @param myHeader the passed in header
|
* @param myHeader the passed in header
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public void setSAMHeader(SAMFileHeader myHeader) {
|
public void setSAMHeader(SAMFileHeader myHeader) {
|
||||||
this.myHeader = myHeader;
|
this.myHeader = myHeader;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the intervals over which the traversal(s) should happen.
|
|
||||||
* @param locs
|
|
||||||
*/
|
|
||||||
public void setLocation(final List<GenomeLoc> locs) {
|
|
||||||
this.locs = locs;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean hasLocations() {
|
|
||||||
return this.locs != null && !this.locs.isEmpty();
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<GenomeLoc> getLocations() {
|
|
||||||
return this.locs;
|
|
||||||
}
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// printing
|
// printing
|
||||||
|
|
@ -221,17 +66,18 @@ public abstract class TraversalEngine {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param curTime (current runtime, in millisecs)
|
* @param curTime (current runtime, in millisecs)
|
||||||
|
*
|
||||||
* @return true if the maximum interval (in millisecs) has passed since the last printing
|
* @return true if the maximum interval (in millisecs) has passed since the last printing
|
||||||
*/
|
*/
|
||||||
protected boolean maxElapsedIntervalForPrinting(final long curTime) {
|
private boolean maxElapsedIntervalForPrinting(final long curTime) {
|
||||||
return (curTime - this.lastProgressPrintTime) > MAX_PROGRESS_PRINT_TIME;
|
return (curTime - this.lastProgressPrintTime) > MAX_PROGRESS_PRINT_TIME;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Forward request to printProgress
|
* Forward request to printProgress
|
||||||
*
|
*
|
||||||
* @param type
|
* @param type the type of traversal
|
||||||
* @param loc
|
* @param loc the location
|
||||||
*/
|
*/
|
||||||
public void printProgress(final String type, GenomeLoc loc) {
|
public void printProgress(final String type, GenomeLoc loc) {
|
||||||
printProgress(false, type, loc);
|
printProgress(false, type, loc);
|
||||||
|
|
@ -245,7 +91,7 @@ public abstract class TraversalEngine {
|
||||||
* @param type String to print out describing our atomic traversal type ("read", "locus", etc)
|
* @param type String to print out describing our atomic traversal type ("read", "locus", etc)
|
||||||
* @param loc Current location
|
* @param loc Current location
|
||||||
*/
|
*/
|
||||||
public void printProgress(boolean mustPrint, final String type, GenomeLoc loc) {
|
private void printProgress(boolean mustPrint, final String type, GenomeLoc loc) {
|
||||||
final long nRecords = TraversalStatistics.nRecords;
|
final long nRecords = TraversalStatistics.nRecords;
|
||||||
final long curTime = System.currentTimeMillis();
|
final long curTime = System.currentTimeMillis();
|
||||||
final double elapsed = (curTime - startTime) / 1000.0;
|
final double elapsed = (curTime - startTime) / 1000.0;
|
||||||
|
|
@ -259,22 +105,16 @@ public abstract class TraversalEngine {
|
||||||
else
|
else
|
||||||
logger.info(String.format("[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)", nRecords, type, elapsed, secsPer1MReads, type));
|
logger.info(String.format("[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)", nRecords, type, elapsed, secsPer1MReads, type));
|
||||||
|
|
||||||
// Currently samReadingTracker will print misleading info if we're not processing the whole file
|
|
||||||
|
|
||||||
// If an index is enabled, file read progress is meaningless because a linear
|
|
||||||
// traversal is not being performed. For now, don't bother printing progress.
|
|
||||||
// TODO: Create a sam indexed read tracker that tracks based on percentage through the query.
|
|
||||||
if (samReadingTracker != null && this.locs == null)
|
|
||||||
logger.info(String.format("[PROGRESS] -> %s", samReadingTracker.progressMeter()));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A passthrough method so that subclasses can report which types of traversals they're using.
|
* A passthrough method so that subclasses can report which types of traversals they're using.
|
||||||
|
*
|
||||||
* @param sum Result of the computation.
|
* @param sum Result of the computation.
|
||||||
* @param <T> Type of the computation.
|
* @param <T> Type of the computation.
|
||||||
*/
|
*/
|
||||||
public abstract <T> void printOnTraversalDone( T sum );
|
public abstract <T> void printOnTraversalDone(T sum);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Called after a traversal to print out information about the traversal process
|
* Called after a traversal to print out information about the traversal process
|
||||||
|
|
@ -290,9 +130,9 @@ public abstract class TraversalEngine {
|
||||||
final double elapsed = (curTime - startTime) / 1000.0;
|
final double elapsed = (curTime - startTime) / 1000.0;
|
||||||
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
|
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
|
||||||
logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)",
|
logger.info(String.format("Traversal skipped %d valid reads out of %d total (%.2f%%)",
|
||||||
TraversalStatistics.nSkippedReads,
|
TraversalStatistics.nSkippedReads,
|
||||||
TraversalStatistics.nReads,
|
TraversalStatistics.nReads,
|
||||||
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
|
(TraversalStatistics.nSkippedReads * 100.0) / TraversalStatistics.nReads));
|
||||||
logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
|
logger.info(String.format(" -> %d unmapped reads", TraversalStatistics.nUnmappedReads));
|
||||||
logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates));
|
logger.info(String.format(" -> %d duplicate reads", TraversalStatistics.nDuplicates));
|
||||||
logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary));
|
logger.info(String.format(" -> %d non-primary reads", TraversalStatistics.nNotPrimary));
|
||||||
|
|
@ -306,147 +146,26 @@ public abstract class TraversalEngine {
|
||||||
//
|
//
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
/**
|
/** Initialize the traversal engine. After this point traversals can be run over the data */
|
||||||
* Initialize the traversal engine. After this point traversals can be run over the data
|
public void initialize() {
|
||||||
*
|
|
||||||
* @return true on success
|
|
||||||
*/
|
|
||||||
public boolean initialize() {
|
|
||||||
lastProgressPrintTime = startTime = System.currentTimeMillis();
|
lastProgressPrintTime = startTime = System.currentTimeMillis();
|
||||||
// Initial the reference ordered data iterators
|
// Initial the reference ordered data iterators
|
||||||
initializeRODs();
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected SAMFileReader initializeSAMFile(File samFile) {
|
|
||||||
// todo: fixme, this is a hack to try out dynamic merging
|
|
||||||
if ( samFile.toString().endsWith(".list") ) {
|
|
||||||
return null;
|
|
||||||
// todo: omg, this is just scary, just it's just for testing purposes. fix with the new DataSource system
|
|
||||||
} else {
|
|
||||||
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
|
||||||
samReader.setValidationStringency(strictness);
|
|
||||||
|
|
||||||
final SAMFileHeader header = samReader.getFileHeader();
|
|
||||||
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
|
|
||||||
|
|
||||||
// Kludge filename into sam file header.
|
|
||||||
if (samReader.getFileHeader().getReadGroups().size() < 1) {
|
|
||||||
//logger.warn("Setting header in reader " + f.getName());
|
|
||||||
SAMReadGroupRecord rec = new SAMReadGroupRecord(samFile.getName());
|
|
||||||
rec.setLibrary(samFile.getName());
|
|
||||||
rec.setSample(samFile.getName());
|
|
||||||
|
|
||||||
samReader.getFileHeader().addReadGroup(rec);
|
|
||||||
}
|
|
||||||
|
|
||||||
return samReader;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prepare the list of reference ordered data iterators for each of the rods
|
* this method must be implemented by all traversal engines
|
||||||
*
|
*
|
||||||
* @return A list of ROD iterators for getting data from each ROD
|
* @param walker the walker to run with
|
||||||
*/
|
* @param shard a shard of data
|
||||||
protected void initializeRODs() {
|
* @param dataProvider the data provider that generates data given the shard
|
||||||
// set up reference ordered data
|
* @param sum the accumulator
|
||||||
rodIters = new ArrayList<Pair<String, RODIterator<? extends ReferenceOrderedDatum>>>();
|
* @param <M> an object of the map type
|
||||||
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> data : rods) {
|
* @param <T> an object of the reduce type
|
||||||
rodIters.add(new Pair<String, RODIterator<? extends ReferenceOrderedDatum>>(data.getName(), data.iterator()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// shutdown
|
|
||||||
//
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public boolean shutdown() {
|
|
||||||
// todo: actually shutdown the resources
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// dealing with reference ordered data
|
|
||||||
//
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Builds a list of the reference ordered datum at loc from each of the iterators. This function
|
|
||||||
* assumes you are accessing the data in order. You can't use this function for random access. Each
|
|
||||||
* successive call moves you along the file, consuming all data before loc.
|
|
||||||
*
|
*
|
||||||
* @param loc The location to get the rods at
|
* @return an object of the reduce type
|
||||||
* @return A list of ReferenceOrderDatum at loc. ROD without a datum at loc will be null in the list
|
|
||||||
*/
|
*/
|
||||||
protected RefMetaDataTracker getReferenceOrderedDataAtLocus(final GenomeLoc loc) {
|
public abstract <M, T> T traverse(Walker<M, T> walker,
|
||||||
RefMetaDataTracker tracks = new RefMetaDataTracker();
|
Shard shard,
|
||||||
for (Pair<String, RODIterator<? extends ReferenceOrderedDatum>> pair : rodIters) {
|
ShardDataProvider dataProvider,
|
||||||
String name = pair.getFirst();
|
T sum);
|
||||||
tracks.bind(name, pair.getSecond().seekForward(loc));
|
|
||||||
}
|
|
||||||
|
|
||||||
return tracks;
|
|
||||||
}
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// processing
|
|
||||||
//
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public <M, T> T traverse(Walker<M, T> walker) {
|
|
||||||
T sum = null;
|
|
||||||
if ( hasLocations() && walker.isReduceByInterval() ) {
|
|
||||||
logger.info("Doing reduce by interval processing");
|
|
||||||
if ( ! hasLocations() )
|
|
||||||
Utils.scareUser("ReduceByInterval requested by no interval was provided");
|
|
||||||
List<Pair<GenomeLoc, T>> map = new ArrayList<Pair<GenomeLoc, T>>(locs.size());
|
|
||||||
for ( GenomeLoc loc : locs ) {
|
|
||||||
ArrayList<GenomeLoc> l = new ArrayList<GenomeLoc>();
|
|
||||||
l.add(loc);
|
|
||||||
T intervalSum = traverse(walker, l);
|
|
||||||
sum = intervalSum;
|
|
||||||
map.add(new Pair<GenomeLoc, T>(loc, intervalSum));
|
|
||||||
}
|
|
||||||
walker.onTraversalDone(map);
|
|
||||||
} else {
|
|
||||||
List<GenomeLoc> l = new ArrayList<GenomeLoc>();
|
|
||||||
if ( hasLocations() )
|
|
||||||
l = locs;
|
|
||||||
sum = traverse(walker, l);
|
|
||||||
}
|
|
||||||
|
|
||||||
printOnTraversalDone("elements", sum);
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
public <M, T> T traverse(Walker<M, T> walker, List<GenomeLoc> locations) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public <M,T> T traverse( Walker<M,T> walker,
|
|
||||||
Shard shard,
|
|
||||||
ShardDataProvider dataProvider,
|
|
||||||
T sum ) {
|
|
||||||
throw new UnsupportedOperationException( "This method is a required override for new traversal engines. Please port your traversal engine to the new style." );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void verifySortOrder(final boolean requiresSortedOrder) {
|
|
||||||
if (beSafeP && !SORT_ON_FLY && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
|
|
||||||
final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing";
|
|
||||||
if (requiresSortedOrder || strictness == SAMFileReader.ValidationStringency.STRICT)
|
|
||||||
throw new RuntimeIOException(msg);
|
|
||||||
else if (strictness == SAMFileReader.ValidationStringency.LENIENT)
|
|
||||||
logger.warn(msg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@Deprecated
|
|
||||||
public SAMFileReader getSamReader() { return this.samReader; }
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -35,15 +35,12 @@ import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.Pair;
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
@ -71,17 +68,6 @@ public class TraverseDuplicates extends TraversalEngine {
|
||||||
|
|
||||||
private final boolean DEBUG = false;
|
private final boolean DEBUG = false;
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new, uninitialized TraversalEngine
|
|
||||||
*
|
|
||||||
* @param reads SAM/BAM file of reads
|
|
||||||
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
|
||||||
* @param rods Array of reference ordered data sets
|
|
||||||
*/
|
|
||||||
public TraverseDuplicates(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
|
||||||
super(reads, ref, rods);
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter) {
|
private List<SAMRecord> readsAtLoc(final SAMRecord read, PushbackIterator<SAMRecord> iter) {
|
||||||
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
|
GenomeLoc site = GenomeLocParser.createGenomeLoc(read);
|
||||||
ArrayList<SAMRecord> l = new ArrayList<SAMRecord>();
|
ArrayList<SAMRecord> l = new ArrayList<SAMRecord>();
|
||||||
|
|
|
||||||
|
|
@ -1,27 +1,18 @@
|
||||||
package org.broadinstitute.sting.gatk.traversals;
|
package org.broadinstitute.sting.gatk.traversals;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.AllLocusView;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.CoveredLocusView;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.io.File;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
||||||
|
|
@ -33,10 +24,6 @@ public class TraverseLoci extends TraversalEngine {
|
||||||
*/
|
*/
|
||||||
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||||
|
|
||||||
public TraverseLoci(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
|
||||||
super( reads, ref, rods );
|
|
||||||
}
|
|
||||||
|
|
||||||
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
|
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
|
||||||
if ( locations.isEmpty() )
|
if ( locations.isEmpty() )
|
||||||
Utils.scareUser("Requested all locations be processed without providing locations to be processed!");
|
Utils.scareUser("Requested all locations be processed without providing locations to be processed!");
|
||||||
|
|
|
||||||
|
|
@ -1,20 +1,20 @@
|
||||||
package org.broadinstitute.sting.gatk.traversals;
|
package org.broadinstitute.sting.gatk.traversals;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
|
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
import java.io.File;
|
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
|
|
@ -25,10 +25,6 @@ import net.sf.samtools.SAMRecord;
|
||||||
*/
|
*/
|
||||||
public class TraverseLocusWindows extends TraversalEngine {
|
public class TraverseLocusWindows extends TraversalEngine {
|
||||||
|
|
||||||
public TraverseLocusWindows(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
|
||||||
super(reads, ref, rods);
|
|
||||||
}
|
|
||||||
|
|
||||||
public <M,T> T traverse( Walker<M,T> walker,
|
public <M,T> T traverse( Walker<M,T> walker,
|
||||||
Shard shard,
|
Shard shard,
|
||||||
ShardDataProvider dataProvider,
|
ShardDataProvider dataProvider,
|
||||||
|
|
@ -70,7 +66,7 @@ public class TraverseLocusWindows extends TraversalEngine {
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
private LocusContext getLocusContext(Iterator<SAMRecord> readIter, GenomeLoc interval) {
|
private LocusContext getLocusContext(StingSAMIterator readIter, GenomeLoc interval) {
|
||||||
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||||
boolean done = false;
|
boolean done = false;
|
||||||
long leftmostIndex = interval.getStart(),
|
long leftmostIndex = interval.getStart(),
|
||||||
|
|
@ -92,8 +88,8 @@ public class TraverseLocusWindows extends TraversalEngine {
|
||||||
|
|
||||||
GenomeLoc window = GenomeLocParser.createGenomeLoc(interval.getContig(), leftmostIndex, rightmostIndex);
|
GenomeLoc window = GenomeLocParser.createGenomeLoc(interval.getContig(), leftmostIndex, rightmostIndex);
|
||||||
LocusContext locus = new LocusContext(window, reads, null);
|
LocusContext locus = new LocusContext(window, reads, null);
|
||||||
if ( DOWNSAMPLE_BY_COVERAGE )
|
if ( readIter.getSourceInfo().getDownsampleToCoverage() != null )
|
||||||
locus.downsampleToCoverage(downsamplingCoverage);
|
locus.downsampleToCoverage(readIter.getSourceInfo().getDownsampleToCoverage());
|
||||||
|
|
||||||
return locus;
|
return locus;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,24 +4,20 @@ import net.sf.samtools.SAMRecord;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
|
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.shards.IntervalShard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.IntervalShard;
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009 The Broad Institute
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
|
@ -63,22 +59,9 @@ public class TraverseReads extends TraversalEngine {
|
||||||
/** our log, which we want to capture anything from this class */
|
/** our log, which we want to capture anything from this class */
|
||||||
protected static Logger logger = Logger.getLogger(TraverseReads.class);
|
protected static Logger logger = Logger.getLogger(TraverseReads.class);
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new, uninitialized TraversalEngine
|
|
||||||
*
|
|
||||||
* @param reads SAM/BAM file of reads
|
|
||||||
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
|
||||||
* @param rods Array of reference ordered data sets
|
|
||||||
*/
|
|
||||||
public TraverseReads(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
|
||||||
super(reads, ref, rods);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Traverse by reads, given the data and the walker
|
* Traverse by reads, given the data and the walker
|
||||||
*
|
*
|
||||||
* @param walker the walker to traverse with
|
* @param walker the walker to traverse with
|
||||||
* @param shard the shard, specifying the range of data to iterate over
|
* @param shard the shard, specifying the range of data to iterate over
|
||||||
* @param dataProvider the provider of the reads data
|
* @param dataProvider the provider of the reads data
|
||||||
|
|
@ -107,7 +90,7 @@ public class TraverseReads extends TraversalEngine {
|
||||||
|
|
||||||
ReadWalker<M, T> readWalker = (ReadWalker<M, T>) walker;
|
ReadWalker<M, T> readWalker = (ReadWalker<M, T>) walker;
|
||||||
boolean needsReferenceBasesP = WalkerManager.isRequired(walker, DataSource.REFERENCE_BASES);
|
boolean needsReferenceBasesP = WalkerManager.isRequired(walker, DataSource.REFERENCE_BASES);
|
||||||
|
|
||||||
ReadView reads = new ReadView(dataProvider);
|
ReadView reads = new ReadView(dataProvider);
|
||||||
ReadReferenceView reference = new ReadReferenceView(dataProvider);
|
ReadReferenceView reference = new ReadReferenceView(dataProvider);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,249 +0,0 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileWriter;
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|
||||||
import org.broadinstitute.sting.playground.indels.*;
|
|
||||||
import net.sf.samtools.*;
|
|
||||||
|
|
||||||
import net.sf.picard.reference.ReferenceSequence;
|
|
||||||
|
|
||||||
// Investigates indels called in the alignment data
|
|
||||||
|
|
||||||
@WalkerName("InspectIndels")
|
|
||||||
public class IndelInspector extends ReadWalker<Integer, Integer> {
|
|
||||||
@Argument(fullName="IndelVerbosity", shortName="iverb", required=false, doc="Verbosity level: SILENT, PILESUMMARY, ALIGNMENTS") public String VERBOSITY_LEVEL = "SILENT";
|
|
||||||
@Argument(fullName="OutputNormal", shortName="outNorm", required=true, doc="Output file (sam or bam) for non-indel related reads and indel reads that were not improved") public String OUT1;
|
|
||||||
@Argument(fullName="OutputIndel", shortName="outIndel", required=true, doc="Output file (sam or bam) for improved (realigned) indel related reads") public String OUT2;
|
|
||||||
@Argument(fullName="ControlRun", shortName="paranoid", required=false, doc="If true, all reads that would be otherwise picked and processed by this tool will be saved, unmodified, into the OutputNormal file") public boolean CONTROL_RUN = false;
|
|
||||||
@Argument(fullName="ErrorCheckMode", shortName="error", required=false, doc="Error counting mode: MM (mismatches only (from sam tags)), MC (mismatches only doing actual mismatch count on the fly (use this if tags are incorrectly set)), ERR (errors (arachne style: mm+gap lengths)), MG (count mismatches and gaps as one error each)") public String ERR_MODE;
|
|
||||||
@Argument(fullName="MaxErrors", shortName="maxError", required=false, doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS = 10000;
|
|
||||||
@Argument(fullName="MaxReadLength", shortName="maxRead", required=false, doc="Ignore reads that are longer than the specified cutoff (not a good way to do things but might be necessary because of performance issues)") public int MAX_READ_LENGTH = -1;
|
|
||||||
|
|
||||||
private int discarded_cigar_count = 0;
|
|
||||||
private int discarded_long_read_count = 0;
|
|
||||||
private int discarded_maxerr_count = 0;
|
|
||||||
private int unmapped_read_count = 0;
|
|
||||||
private int total_reads = 0;
|
|
||||||
private IndelRecordPileCollector collector;
|
|
||||||
private PileBuilder pileBuilder = null;
|
|
||||||
private PassThroughWriter ptWriter;
|
|
||||||
private String cur_contig = null;
|
|
||||||
|
|
||||||
|
|
||||||
// public boolean filter(LocusContext context, SAMRecord read) {
|
|
||||||
// // we only want aligned reads
|
|
||||||
// return !read.getReadUnmappedFlag();
|
|
||||||
// }
|
|
||||||
|
|
||||||
public void initialize() {
|
|
||||||
|
|
||||||
if ( ERR_MODE != null && ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") && ! ERR_MODE.equals("MC") ) {
|
|
||||||
err.println("Unknown value specified for ERR_MODE: "+ERR_MODE+ "... skipping it.");
|
|
||||||
ERR_MODE = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
SAMFileHeader header = getToolkit().getSamReader().getFileHeader();
|
|
||||||
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
|
||||||
ptWriter = new PassThroughWriter(OUT1, header);
|
|
||||||
if ( ! CONTROL_RUN ) pileBuilder = new PileBuilder(OUT2, header, ptWriter);
|
|
||||||
|
|
||||||
try {
|
|
||||||
if ( CONTROL_RUN ) collector = new IndelRecordPileCollector(ptWriter, new DiscardingPileReceiver() );
|
|
||||||
else collector = new IndelRecordPileCollector(ptWriter, pileBuilder );
|
|
||||||
} catch(Exception e) {
|
|
||||||
err.println(e.getMessage());
|
|
||||||
}
|
|
||||||
if ( collector == null )
|
|
||||||
return;
|
|
||||||
|
|
||||||
collector.setControlRun(CONTROL_RUN);
|
|
||||||
|
|
||||||
if ( ! CONTROL_RUN ) {
|
|
||||||
if ( VERBOSITY_LEVEL.toUpperCase().equals("SILENT")) pileBuilder.setVerbosity(PileBuilder.SILENT);
|
|
||||||
else if ( VERBOSITY_LEVEL.toUpperCase().equals("PILESUMMARY") ) pileBuilder.setVerbosity(PileBuilder.PILESUMMARY);
|
|
||||||
else if ( VERBOSITY_LEVEL.toUpperCase().equals("ALIGNMENTS") ) pileBuilder.setVerbosity(PileBuilder.ALIGNMENTS);
|
|
||||||
else {
|
|
||||||
err.println("Unrecognized VERBOSITY_LEVEL setting... skipping it.");
|
|
||||||
pileBuilder.setVerbosity(PileBuilder.SILENT);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer map(char[] ref, SAMRecord read) {
|
|
||||||
|
|
||||||
total_reads++;
|
|
||||||
|
|
||||||
if ( read.getReadUnmappedFlag() ) {
|
|
||||||
unmapped_read_count++;
|
|
||||||
ptWriter.receive(read); // make sure we still write the read to the output, we do not want to lose data!
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 11 May 2009 - Commented out because we no longer have the full reference sequence for sanity checking.
|
|
||||||
ReferenceSequence contig_seq = context.getReferenceContig();
|
|
||||||
if ( read.getReferenceName() != cur_contig) {
|
|
||||||
cur_contig = read.getReferenceName();
|
|
||||||
out.println("Contig "+cur_contig);
|
|
||||||
String refstr = new String(contig_seq.getBases());
|
|
||||||
if ( !CONTROL_RUN )
|
|
||||||
pileBuilder.setReferenceSequence(refstr);
|
|
||||||
out.println("loaded contig "+cur_contig+" (index="+read.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
// if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY") > 0 ) continue; // skip chrM and unplaced contigs for now
|
|
||||||
|
|
||||||
if ( MAX_READ_LENGTH > 0 && read.getReadLength() > MAX_READ_LENGTH ) {
|
|
||||||
ptWriter.receive(read);
|
|
||||||
discarded_long_read_count++;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// we currently do not know how to deal with cigars containing elements other than M,I,D, so
|
|
||||||
// let's just skip the reads that contain those other elements (clipped reads?)
|
|
||||||
Cigar c = read.getCigar();
|
|
||||||
boolean cigar_acceptable = true;
|
|
||||||
for ( int z = 0 ; z < c.numCigarElements() ; z++ ) {
|
|
||||||
CigarElement ce = c.getCigarElement(z);
|
|
||||||
switch ( ce.getOperator() ) {
|
|
||||||
case M:
|
|
||||||
case I:
|
|
||||||
case D: break;
|
|
||||||
default:
|
|
||||||
cigar_acceptable = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( ! cigar_acceptable ) {
|
|
||||||
ptWriter.receive(read);
|
|
||||||
discarded_cigar_count++;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int err = -1;
|
|
||||||
/*
|
|
||||||
System.out.println("MM: "+numMismatches(r));
|
|
||||||
System.out.println("direct: "+numMismatchesDirect(r,contig_seq));
|
|
||||||
System.out.print(" ");
|
|
||||||
for ( int i = read.getAlignmentStart() - 1 ; i < read.getAlignmentEnd() ; i++ ) System.out.print((char)contig_seq.getBases()[i]);
|
|
||||||
System.out.println();
|
|
||||||
System.out.println((read.getReadNegativeStrandFlag()?"<-":"->")+read.getReadString());
|
|
||||||
System.out.println("cigar: "+read.getCigarString());
|
|
||||||
System.out.println();
|
|
||||||
if (counter++ == 20 ) break;
|
|
||||||
continue;
|
|
||||||
*/
|
|
||||||
|
|
||||||
if ( ERR_MODE != null ) {
|
|
||||||
if ( ERR_MODE.equals("MM")) err = numMismatches(read,ref);
|
|
||||||
else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(read,ref);
|
|
||||||
else if ( ERR_MODE.equals("ERR")) err = numErrors(read,ref);
|
|
||||||
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(read,ref);
|
|
||||||
if ( err > MAX_ERRS ) {
|
|
||||||
ptWriter.receive(read);
|
|
||||||
discarded_maxerr_count++;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
collector.receive(read);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD
|
|
||||||
*
|
|
||||||
* @param r SAM record that must specify an alignment
|
|
||||||
* @return number of errors (number of mismatches plus total length of all insertions/deletions
|
|
||||||
* @throws RuntimeException
|
|
||||||
*/
|
|
||||||
private static int numErrors(SAMRecord r, char[] ref) throws RuntimeException {
|
|
||||||
|
|
||||||
// NM currently stores the total number of mismatches in all blocks + 1
|
|
||||||
int errs = numMismatches(r,ref);
|
|
||||||
|
|
||||||
// now we have to add the total length of all indels:
|
|
||||||
Cigar c = r.getCigar();
|
|
||||||
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
|
||||||
CigarElement ce = c.getCigarElement(i);
|
|
||||||
switch( ce.getOperator()) {
|
|
||||||
case M : break; // we already have correct number of mismatches
|
|
||||||
case I :
|
|
||||||
case D :
|
|
||||||
errs += ce.getLength();
|
|
||||||
break;
|
|
||||||
default: throw new RuntimeException("Unrecognized cigar element");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return errs;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD
|
|
||||||
*
|
|
||||||
* @param r SAM record that must specify an alignment
|
|
||||||
* @return number of errors (number of mismatches plus total number of all insertions/deletions (each insertion or
|
|
||||||
* deletion will be counted as a single error regardless of the length)
|
|
||||||
* @throws RuntimeException
|
|
||||||
*/
|
|
||||||
private static int numMismatchesGaps(SAMRecord r,char[] ref) throws RuntimeException {
|
|
||||||
|
|
||||||
// NM currently stores the total number of mismatches in all blocks + 1
|
|
||||||
int errs = numMismatches(r,ref);
|
|
||||||
|
|
||||||
// now we have to add the total length of all indels:
|
|
||||||
Cigar c = r.getCigar();
|
|
||||||
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
|
||||||
CigarElement ce = c.getCigarElement(i);
|
|
||||||
switch( ce.getOperator()) {
|
|
||||||
case M : break; // we already have correct number of mismatches
|
|
||||||
case I :
|
|
||||||
case D :
|
|
||||||
errs++;
|
|
||||||
break;
|
|
||||||
default: throw new RuntimeException("Unrecognized cigar element");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return errs;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
|
|
||||||
private static int numMismatches(SAMRecord r, char[] refseq) throws RuntimeException {
|
|
||||||
|
|
||||||
// NM currently stores the total number of mismatches in all blocks + 1
|
|
||||||
Integer i = (Integer)r.getAttribute("NM");
|
|
||||||
if ( i == null ) return AlignmentUtils.numMismatches(r,refseq);
|
|
||||||
return ((Integer)r.getAttribute("NM")).intValue() - 1;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// Given result of map function
|
|
||||||
public Integer reduceInit() { return 0; }
|
|
||||||
public Integer reduce(Integer value, Integer sum) {
|
|
||||||
return value + sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void onTraversalDone(Integer result) {
|
|
||||||
collector.close(); // write the reads collector might be still holding
|
|
||||||
|
|
||||||
if ( ! CONTROL_RUN ) {
|
|
||||||
pileBuilder.printStats();
|
|
||||||
pileBuilder.close();
|
|
||||||
}
|
|
||||||
out.println("done.");
|
|
||||||
out.println("Total reads processed: "+ total_reads);
|
|
||||||
out.println("Unmodified: "+ ptWriter.getNumReadsReceived());
|
|
||||||
if ( pileBuilder != null) out.println("Written into realigned piles: "+ pileBuilder.getNumReadsWritten());
|
|
||||||
if ( pileBuilder != null) out.println("Received for realignment: "+ pileBuilder.getNumReadsReceived());
|
|
||||||
out.println("Discarded reads with non-M,I,D cigar elements: "+ discarded_cigar_count);
|
|
||||||
out.println("Discarded long reads (above "+MAX_READ_LENGTH+" bp): "+ discarded_long_read_count);
|
|
||||||
out.println("Discarded reads with error counts exceeding the threshold: "+ discarded_maxerr_count);
|
|
||||||
out.println("Unmapped reads (passed through to the output): "+ unmapped_read_count);
|
|
||||||
out.println();
|
|
||||||
collector.printLengthHistograms();
|
|
||||||
ptWriter.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -56,7 +56,6 @@ public class ArtificialReadsTraversal extends TraversalEngine {
|
||||||
|
|
||||||
/** Creates a new, uninitialized ArtificialReadsTraversal */
|
/** Creates a new, uninitialized ArtificialReadsTraversal */
|
||||||
public ArtificialReadsTraversal() {
|
public ArtificialReadsTraversal() {
|
||||||
super(null, null, null);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// what read ordering are we using
|
// what read ordering are we using
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.CountReadsWalker;
|
import org.broadinstitute.sting.gatk.walkers.CountReadsWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
@ -98,9 +96,8 @@ public class TraverseReadsTest extends BaseTest {
|
||||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||||
fail("Couldn't set the walkers printstream");
|
fail("Couldn't set the walkers printstream");
|
||||||
}
|
}
|
||||||
List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>>();
|
|
||||||
|
traversalEngine = new TraverseReads();
|
||||||
traversalEngine = new TraverseReads(bamList, refFile, rods);
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue