Switch TraverseByLoci over to the new sharding system, and clean up some code involved in passing read files along the pathway from the command line to the traversal engine.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@727 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-05-15 21:02:12 +00:00
parent 57e5f22987
commit 2c4de7b5c5
34 changed files with 596 additions and 228 deletions

View File

@ -1,11 +1,18 @@
package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.FileNotFoundException;
import java.io.File;
import java.util.List;
import java.util.ArrayList;
/**
*
* User: aaron
@ -80,6 +87,9 @@ public class CommandLineGATK extends CommandLineProgram {
}
loadArgumentsIntoObject(argCollection);
loadArgumentsIntoObject(mWalker);
processArguments(argCollection);
this.argCollection.analysisName = this.analysisName;
try {
GATKEngine = new GenomeAnalysisEngine(argCollection, mWalker);
@ -132,4 +142,38 @@ public class CommandLineGATK extends CommandLineProgram {
this.argCollection = argCollection;
}
/**
 * Preprocess the arguments before submitting them to the GATK engine.
 * Currently the only preprocessing step is expanding any ".list" files among
 * the reads arguments into their constituent reads files (see unpackReads).
 * @param argCollection Collection of arguments to preprocess; its samFiles
 *                      list is replaced in place with the expanded list.
 */
private void processArguments( GATKArgumentCollection argCollection ) {
argCollection.samFiles = unpackReads( argCollection.samFiles );
}
/**
 * Expand a list of reads inputs into concrete reads files. An entry whose
 * name ends in ".list" is treated as a text file naming one reads file per
 * line (one level of expansion only); every other entry passes through
 * unchanged.
 * @param inputFiles Files supplied on the command line; may mix reads files and ".list" index files.
 * @return A flattened list containing only concrete reads files.
 */
private List<File> unpackReads( List<File> inputFiles ) {
    List<File> expanded = new ArrayList<File>();
    for( File candidate : inputFiles ) {
        // Plain reads file: pass straight through.
        if( !candidate.getName().endsWith(".list") ) {
            expanded.add( candidate );
            continue;
        }
        // ".list" file: each non-empty line names a reads file.
        try {
            for( String line : new xReadLines(candidate) )
                expanded.add( new File(line) );
        }
        catch( FileNotFoundException ex ) {
            throw new StingException("Unable to find file while unpacking reads", ex);
        }
    }
    return expanded;
}
}

View File

@ -96,15 +96,15 @@ public class GATKArgumentCollection {
@Element(required=false)
@Argument(fullName = "sort_on_the_fly", shortName = "sort", doc = "Maximum number of reads to sort on the fly", required = false)
public String maximumReadSorts = null;
public Integer maximumReadSorts = null;
@Element(required=false)
@Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
public String downsampleFraction = null;
public Double downsampleFraction = null;
@Element(required=false)
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to", required = false)
public String downsampleCoverage = null;
public Integer downsampleCoverage = null;
@Element(required=false)
@Argument(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.", required = false)
@ -238,13 +238,16 @@ public class GATKArgumentCollection {
if (!other.unsafe.equals(this.unsafe)) {
return false;
}
if (!other.maximumReadSorts.equals(this.maximumReadSorts)) {
if ((other.maximumReadSorts == null && this.maximumReadSorts != null) ||
(other.maximumReadSorts != null && !other.maximumReadSorts.equals(this.maximumReadSorts))) {
return false;
}
if (!other.downsampleFraction.equals(this.downsampleFraction)) {
if ((other.downsampleFraction == null && this.downsampleFraction != null) ||
(other.downsampleFraction != null && !other.downsampleFraction.equals(this.downsampleFraction))) {
return false;
}
if (!other.downsampleCoverage.equals(this.downsampleCoverage)) {
if ((other.downsampleCoverage == null && this.downsampleCoverage != null) ||
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
return false;
}
if (!other.walkAllLoci.equals(this.walkAllLoci)) {

View File

@ -84,7 +84,7 @@ public class GenomeAnalysisEngine {
// if we're a read or a locus walker, we use the new system. Right now we have complicated
// branching based on the input data, but this should disappear when all the traversals are switched over
if ((my_walker instanceof LocusWalker && argCollection.walkAllLoci && !(argCollection.samFiles == null || argCollection.samFiles.size() == 0)) ||
if ((my_walker instanceof LocusWalker && !(argCollection.samFiles == null || argCollection.samFiles.size() == 0)) ||
my_walker instanceof ReadWalker) {
microScheduler = createMicroscheduler(my_walker, rods);
} else { // we have an old style traversal, once we're done return
@ -181,13 +181,15 @@ public class GenomeAnalysisEngine {
Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", argCollection.analysisName, argCollection.samFiles));
// create the MicroScheduler
microScheduler = MicroScheduler.create(my_walker, argCollection.samFiles, argCollection.referenceFile, rods, argCollection.numberOfThreads);
if( argCollection.walkAllLoci )
Utils.scareUser("Argument --all_loci is deprecated. Please annotate your walker with @By(DataSource.REFERENCE) to perform a by-reference traversal.");
microScheduler = MicroScheduler.create(my_walker, new Reads(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
engine = microScheduler.getTraversalEngine();
}
else if (my_walker instanceof ReadWalker) {
if (argCollection.referenceFile == null)
Utils.scareUser(String.format("Locus-based traversals require a reference file but none was given"));
microScheduler = MicroScheduler.create(my_walker, argCollection.samFiles, argCollection.referenceFile, rods, argCollection.numberOfThreads);
microScheduler = MicroScheduler.create(my_walker, new Reads(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
engine = microScheduler.getTraversalEngine();
}
@ -209,26 +211,14 @@ public class GenomeAnalysisEngine {
if (argCollection.intervals != null) {
engine.setLocation(setupIntervalRegion());
}
// hmm...
if (argCollection.maximumReadSorts != null) {
engine.setSortOnFly(Integer.parseInt(argCollection.maximumReadSorts));
}
if (argCollection.downsampleFraction != null) {
engine.setDownsampleByFraction(Double.parseDouble(argCollection.downsampleFraction));
}
engine.setReadFilters(new Reads(argCollection));
if (argCollection.downsampleCoverage != null) {
engine.setDownsampleByCoverage(Integer.parseInt(argCollection.downsampleCoverage));
}
engine.setSafetyChecking(!argCollection.unsafe);
engine.setThreadedIO(argCollection.enabledThreadedIO);
engine.setWalkOverAllSites(argCollection.walkAllLoci);
engine.initialize();
}
/**
* setup the interval regions, from either the interval file of the genome region string
*

View File

@ -0,0 +1,95 @@
package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.utils.StingException;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.List;
/**
* User: hanna
* Date: May 14, 2009
* Time: 4:06:26 PM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
 * A data structure containing information about the reads data sources as well as
 * information about how they should be downsampled, sorted, and filtered.
 */
public class Reads {
    /** Concrete files from which reads are drawn. */
    private List<File> sourceFiles = null;
    /** Fraction of reads to retain when downsampling by fraction; null disables it. */
    private Double fractionToKeep = null;
    /** Per-locus coverage target when downsampling by coverage; null disables it. */
    private Integer coverageTarget = null;
    /** Upper bound on the number of on-the-fly sorts; null disables the feature. */
    private Integer sortLimit = null;
    /** Whether reads should be verified while streaming; null when unspecified. */
    private Boolean verifyReads = null;

    /**
     * Gets a list of the files acting as sources of reads.
     * @return A list of files storing reads data.
     */
    public List<File> getReadsFiles() {
        return sourceFiles;
    }

    /**
     * Get the fraction of reads to downsample.
     * @return Downsample fraction, or null if downsampling by fraction is disabled.
     */
    public Double getDownsamplingFraction() {
        return fractionToKeep;
    }

    /**
     * Downsample each locus to the specified coverage.
     * @return Coverage to which to downsample, or null if disabled.
     */
    public Integer getDownsampleToCoverage() {
        return coverageTarget;
    }

    /**
     * Get the maximum number of supported on-the-fly sorts.
     * @return Max number of on-the-fly sorts, or null if disabled.
     */
    public Integer getMaxOnTheFlySorts() {
        return sortLimit;
    }

    /**
     * Return whether to 'verify' the reads as we pass through them.
     * @return Whether to verify the reads; null when not configured.
     */
    public Boolean getSafetyChecking() {
        return verifyReads;
    }

    /**
     * Simple constructor for unit testing.
     * @param readsFiles List of reads files to open.
     */
    public Reads( List<File> readsFiles ) {
        this.sourceFiles = readsFiles;
    }

    /**
     * Extract the command-line arguments having to do with reads input
     * files and store them in an easy-to-work-with package. Constructor
     * is package protected.
     * @param arguments GATK parsed command-line arguments.
     */
    Reads( GATKArgumentCollection arguments ) {
        // Fields default to null, so straight assignment is equivalent to the
        // null-guarded copies: a null argument simply leaves the field null.
        sourceFiles = arguments.samFiles;
        fractionToKeep = arguments.downsampleFraction;
        coverageTarget = arguments.downsampleCoverage;
        sortLimit = arguments.maximumReadSorts;
        verifyReads = !arguments.unsafe;
    }
}

View File

@ -15,8 +15,11 @@ import java.util.Map;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.utils.JVMUtils;
import org.broadinstitute.sting.utils.PathUtils;
import org.broadinstitute.sting.utils.StingException;
import org.apache.log4j.Logger;
/**
@ -95,10 +98,28 @@ public class WalkerManager {
return (Walker) walker.newInstance();
}
/**
 * Retrieves the walker class given a walker name.
 * @param walkerName Name of the walker.
 * @return Class representing the walker; presumably null when no walker by
 *         that name is registered (plain lookup in the walkers registry) —
 *         TODO confirm lookup semantics of the walkers collection.
 */
public Class getWalkerClassByName(String walkerName) {
return walkers.get(walkerName);
}
/**
 * Gets the data source for the provided walker by inspecting its @By annotation.
 * @param walker The walker whose declared data source should be looked up.
 * @return Which type of data source to traverse over...reads or reference?
 */
public static DataSource getWalkerDataSource(Walker walker) {
    final Class<? extends Walker> type = walker.getClass();
    final By dataSourceAnnotation = type.getAnnotation(By.class);
    if( dataSourceAnnotation != null )
        return dataSourceAnnotation.value();
    throw new StingException("Unable to find By annotation for walker class " + type.getName());
}
/**
* Load classes internal to the classpath from an arbitrary location.
*

View File

@ -1,17 +1,9 @@
package org.broadinstitute.sting.gatk.dataSources.providers;
import net.sf.samtools.SAMRecord;
import java.util.Iterator;
import java.util.NoSuchElementException;
import edu.mit.broad.picard.filter.FilteringIterator;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.iterators.LocusContextIteratorByHanger;
import org.broadinstitute.sting.gatk.iterators.LocusContextIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.utils.GenomeLoc;
/**
* User: hanna
@ -30,24 +22,25 @@ import org.broadinstitute.sting.utils.GenomeLoc;
* A LocusContextQueue over which the user can iterate.
*/
public class IterableLocusContextQueue implements LocusContextQueue, LocusIterator {
private Shard shard;
private LocusContextIterator loci;
public class IterableLocusContextQueue extends LocusContextQueue implements LocusIterator {
/**
* What's the context for the last locus accessed?
* @param provider
*/
private LocusContext nextLocusContext = null;
private LocusContext prefetched = null;
/**
* Has this prefetch been consumed? If this flag is set,
* the prefetch will skip to the next argument in the system.
*/
private boolean prefetchConsumed = true;
/**
* Create a new queue of locus contexts.
* @param provider
*/
public IterableLocusContextQueue(ShardDataProvider provider) {
Iterator<SAMRecord> reads = new FilteringIterator(provider.getReadIterator(), new TraversalEngine.locusStreamFilterFunc());
this.loci = new LocusContextIteratorByHanger(reads);
this.shard = provider.getShard();
super( provider );
}
/**
@ -55,7 +48,8 @@ public class IterableLocusContextQueue implements LocusContextQueue, LocusIterat
* @return True if another locus present in this iterator. Otherwise, false.
*/
public boolean hasNext() {
    // Pull the next in-bounds locus context into the prefetch slot (a no-op if
    // an unconsumed one is already waiting), then report whether one was found.
    prefetchLocusContext();
    return prefetched != null;
}
/**
@ -63,12 +57,36 @@ public class IterableLocusContextQueue implements LocusContextQueue, LocusIterat
* @return Next element in the queue.
*/
public GenomeLoc next() {
do {
nextLocusContext = loci.next();
prefetchLocusContext();
prefetchConsumed = true;
// Signal that the prefetcher needs to grab another entry off the queue.
return prefetched.getLocation();
}
/**
 * Find the next locus context within the bounds of the current shard and store
 * it in the prefetched member variable. When the prefetch is consumed, the 'consumer'
 * should signal it as such by marking prefetchConsumed = true.
 */
private void prefetchLocusContext() {
    // A previously fetched context is still waiting to be consumed; keep it.
    if( !prefetchConsumed )
        return;

    prefetched = null;
    prefetchConsumed = false;

    // If another locus context bounded by this shard exists, find it; skip any
    // contexts lying before the shard and stop once we run past its end.
    boolean prefetchOutOfBounds = true;
    while( hasNextLocusContext() && prefetchOutOfBounds ) {
        prefetched = getNextLocusContext();
        prefetchOutOfBounds = (prefetched.getLocation().isBefore(shard.getGenomeLoc()) ||
                               prefetched.getLocation().isPast(shard.getGenomeLoc()));
    }

    // Can't find a valid prefetch? Set prefetch to null. If prefetched == null and
    // prefetchConsumed == false, the queue is out of entries.
    if( prefetchOutOfBounds )
        prefetched = null;
}
/**
@ -83,9 +101,9 @@ public class IterableLocusContextQueue implements LocusContextQueue, LocusIterat
* @return
*/
public LocusContext peek() {
    // The prefetch slot holds the current element; empty slot means the queue
    // is exhausted.
    if( prefetched == null )
        throw new NoSuchElementException("No more elements remaining in queue");
    return prefetched;
}
/**
@ -93,10 +111,8 @@ public class IterableLocusContextQueue implements LocusContextQueue, LocusIterat
* @param seekPoint
*/
public LocusContextQueue seek( GenomeLoc seekPoint ) {
    // This queue only iterates forward; a "seek" is legal only when it targets
    // the position the iterator is already sitting on.
    if( prefetched == null || !seekPoint.equals(prefetched.getLocation()) )
        throw new IllegalArgumentException("IterableLocusContextQueue doesn't support seeking and iterator is in the wrong position.");
    return this;
}

View File

@ -1,7 +1,17 @@
package org.broadinstitute.sting.gatk.dataSources.providers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.LocusContextIteratorByHanger;
import org.broadinstitute.sting.gatk.iterators.LocusContextIterator;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import net.sf.samtools.SAMRecord;
import java.util.Iterator;
import edu.mit.broad.picard.filter.FilteringIterator;
/**
* User: hanna
* Date: May 13, 2009
@ -19,17 +29,50 @@ import org.broadinstitute.sting.utils.GenomeLoc;
* A queue of locus context entries.
*/
public interface LocusContextQueue {
public abstract class LocusContextQueue {
protected Shard shard;
private Reads sourceInfo;
private LocusContextIterator loci;
public LocusContextQueue(ShardDataProvider provider) {
Iterator<SAMRecord> reads = new FilteringIterator(provider.getReadIterator(), new TraversalEngine.locusStreamFilterFunc());
this.loci = new LocusContextIteratorByHanger(reads);
this.sourceInfo = provider.getReadIterator().getSourceInfo();
this.shard = provider.getShard();
}
/**
* Get the locus context at the given position.
* @return Locus context, or null if no locus context exists at this position.
*/
LocusContext peek();
public abstract LocusContext peek();
/**
* Seek to the given point the queue of locus contexts.
* @param target Target base pair to which to seek. Must be a single base pair.
* @return an instance of itself for parameter chaining.
*/
public LocusContextQueue seek(GenomeLoc target);
public abstract LocusContextQueue seek(GenomeLoc target);
/**
* Gets the next locus context, applying filtering as necessary.
* @return Locus context to work with.
*/
protected LocusContext getNextLocusContext() {
LocusContext next = loci.next();
if( sourceInfo.getDownsampleToCoverage() != null )
next.downsampleToCoverage( sourceInfo.getDownsampleToCoverage() );
return next;
}
/**
* hasNext()-style iterator for base iterator.
* @return
*/
protected boolean hasNextLocusContext() {
return loci.hasNext();
}
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.dataSources.providers;
import org.broadinstitute.sting.gatk.iterators.LocusContextIterator;
import org.broadinstitute.sting.gatk.iterators.LocusContextIteratorByHanger;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -31,10 +32,7 @@ import edu.mit.broad.picard.filter.FilteringIterator;
* implementation of java.util.Queue interface.
*/
public class SeekableLocusContextQueue implements LocusContextQueue {
private Shard shard;
private LocusContextIterator loci;
public class SeekableLocusContextQueue extends LocusContextQueue {
/**
* Gets the position to which the last seek was requested.
*/
@ -53,15 +51,13 @@ public class SeekableLocusContextQueue implements LocusContextQueue {
* @param provider
*/
public SeekableLocusContextQueue(ShardDataProvider provider) {
    // Base class builds the filtered locus-context iterator and captures the shard.
    super(provider);

    // Seed the state tracking members with the first possible seek position and the first possible locus context.
    seekPoint = new GenomeLoc(shard.getGenomeLoc().getContigIndex(),shard.getGenomeLoc().getStart());
    if( hasNextLocusContext() )
        nextLocusContext = getNextLocusContext();
    else
        nextLocusContext = this.createEmptyLocusContext(seekPoint);
}
@ -94,15 +90,15 @@ public class SeekableLocusContextQueue implements LocusContextQueue {
seekPoint = (GenomeLoc)target.clone();
// Search for the next locus context following the target positions.
while (nextLocusContext.getLocation().isBefore(target) && loci.hasNext() ) {
while (nextLocusContext.getLocation().isBefore(target) && hasNextLocusContext() ) {
logger.debug(String.format(" current locus is %s vs %s => %d", nextLocusContext.getLocation(),
target,
nextLocusContext.getLocation().compareTo(target)));
nextLocusContext = loci.next();
nextLocusContext = getNextLocusContext();
}
// Couldn't find a next? Force the nextLocusContext to null.
if( nextLocusContext.getLocation().isBefore(target) && !loci.hasNext() )
if( nextLocusContext.getLocation().isBefore(target) && !hasNextLocusContext() )
nextLocusContext = createEmptyLocusContext( seekPoint );
return this;

View File

@ -11,6 +11,9 @@ import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
@ -32,6 +35,11 @@ import java.util.List;
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
public class SAMDataSource implements SimpleDataSource {
/**
* Backing support for reads.
*/
private Reads reads = null;
/** our SAM data files */
private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
@ -65,25 +73,18 @@ public class SAMDataSource implements SimpleDataSource {
/**
* constructor, given sam files
*
* @param samFiles the list of sam files
* @param reads the list of sam files
*/
public SAMDataSource(List<?> samFiles) throws SimpleDataSourceLoadException {
public SAMDataSource(Reads reads) throws SimpleDataSourceLoadException {
this.reads = reads;
// check the length
if (samFiles.size() < 1) {
if (reads.getReadsFiles().size() < 1) {
throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0");
}
for (Object fileName : samFiles) {
File smFile;
if (samFiles.get(0) instanceof String) {
smFile = new File((String) samFiles.get(0));
} else if (samFiles.get(0) instanceof File) {
smFile = (File) fileName;
} else {
throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File");
}
for (File smFile : reads.getReadsFiles()) {
if (!smFile.canRead()) {
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + fileName);
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
}
samFileList.add(smFile);
@ -120,7 +121,7 @@ public class SAMDataSource implements SimpleDataSource {
* @param location the genome location to extract data for
* @return an iterator for that region
*/
public MergingSamRecordIterator2 seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
// right now this is pretty damn heavy, it copies the file list into a reader list every time
SamFileHeaderMerger headerMerger = createHeaderMerger();
@ -132,7 +133,7 @@ public class SAMDataSource implements SimpleDataSource {
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
// return the iterator
return iter;
return StingSAMIteratorAdapter.adapt( reads, iter );
}
/**
@ -144,13 +145,26 @@ public class SAMDataSource implements SimpleDataSource {
* @return an iterator for that region
*/
public StingSAMIterator seek(Shard shard) throws SimpleDataSourceLoadException {
    StingSAMIterator iterator = null;
    if (shard.getShardType() == Shard.ShardType.READ) {
        iterator = seekRead((ReadShard) shard);
        // Wrap the raw iterator with downsampling / sorting / verification
        // decorators before handing it to the traversal (read-shard flavor).
        iterator = TraversalEngine.applyDecoratingIterators(true,
                iterator,
                reads.getDownsamplingFraction(),
                reads.getMaxOnTheFlySorts(),
                reads.getSafetyChecking());
    } else if (shard.getShardType() == Shard.ShardType.LOCUS) {
        iterator = seekLocus(shard.getGenomeLoc());
        // Same decoration for the locus-shard flavor.
        iterator = TraversalEngine.applyDecoratingIterators(false,
                iterator,
                reads.getDownsamplingFraction(),
                reads.getMaxOnTheFlySorts(),
                reads.getSafetyChecking());
    } else {
        throw new StingException("seek: Unknown shard type");
    }
    return iterator;
}
@ -204,7 +218,7 @@ public class SAMDataSource implements SimpleDataSource {
if (bound == null) {
shard.signalDone();
bound = new BoundedReadIterator(iter, 0);
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), 0);
}
return bound;
}
@ -258,7 +272,7 @@ public class SAMDataSource implements SimpleDataSource {
// we're good; increment our read count
this.readsTaken += readCount;
return new BoundedReadIterator(iter, readCount);
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), readCount);
}
@ -276,7 +290,7 @@ public class SAMDataSource implements SimpleDataSource {
if (lastReadPos == null) {
lastReadPos = new GenomeLoc(iter.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0);
iter.queryContained(lastReadPos.getContig(), 1, -1);
bound = new BoundedReadIterator(iter, readCount);
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), readCount);
this.readsTaken = readCount;
}
// we're not at the beginning, not at the end, so we move forward with our ghastly plan...
@ -324,7 +338,7 @@ public class SAMDataSource implements SimpleDataSource {
SamFileHeaderMerger mg = createHeaderMerger();
iter = new MergingSamRecordIterator2(mg);
iter.queryContained(lastReadPos.getContig(), 1, Integer.MAX_VALUE);
return new BoundedReadIterator(iter,readCount);
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter),readCount);
}
}
}
@ -349,7 +363,7 @@ public class SAMDataSource implements SimpleDataSource {
throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
//return null;
}
bound = new BoundedReadIterator(iter, readCount);
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), readCount);
}

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.OutputTracker;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -58,7 +59,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Reduce
* @param refFile Reference for driving the traversal.
* @param nThreadsToUse maximum number of threads to use to do the work
*/
protected HierarchicalMicroScheduler( Walker walker, List<File> reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse ) {
protected HierarchicalMicroScheduler( Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse ) {
super( walker, reads, refFile, rods );
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
if( !(traversalEngine instanceof TraverseLociByReference) )

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.File;
@ -20,7 +21,7 @@ public class LinearMicroScheduler extends MicroScheduler {
* @param reads Reads file(s) to process.
* @param refFile Reference for driving the traversal.
*/
protected LinearMicroScheduler( Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods ) {
    // A linear scheduler adds no state of its own; all setup lives in MicroScheduler.
    super(walker, reads, refFile, rods);
}

View File

@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
@ -51,7 +52,7 @@ public abstract class MicroScheduler {
* @param nThreadsToUse Number of threads to utilize.
* @return The best-fit microscheduler.
*/
public static MicroScheduler create( Walker walker, List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse ) {
public static MicroScheduler create( Walker walker, Reads reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse ) {
if( walker instanceof TreeReducible && nThreadsToUse > 1 ) {
logger.info("Creating hierarchical microscheduler");
return new HierarchicalMicroScheduler( walker, reads, ref, rods, nThreadsToUse );
@ -67,11 +68,11 @@ public abstract class MicroScheduler {
* @param reads The reads.
* @param refFile File pointer to the reference.
*/
protected MicroScheduler( Walker walker, List<File> reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods ) {
protected MicroScheduler( Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods ) {
if (walker instanceof ReadWalker) {
traversalEngine = new TraverseReads(reads, refFile, rods);
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
} else {
traversalEngine = new TraverseLociByReference(reads, refFile, rods);
traversalEngine = new TraverseLociByReference(reads.getReadsFiles(), refFile, rods);
}
this.reads = getReadsDataSource( reads );
@ -132,16 +133,8 @@ public abstract class MicroScheduler {
* Gets a data source for the given set of reads.
* @return A data source for the given set of reads.
*/
private SAMDataSource getReadsDataSource( List<File> reads ) {
List<File> unpackedReads = null;
try {
unpackedReads = TraversalEngine.unpackReads(reads);
}
catch( FileNotFoundException ex ) {
throw new StingException( "Cannot unpack list of reads files", ex );
}
SAMDataSource dataSource = new SAMDataSource( unpackedReads );
private SAMDataSource getReadsDataSource( Reads reads ) {
SAMDataSource dataSource = new SAMDataSource( reads );
// Side effect: initialize the traversal engine with reads data.
// TODO: Give users a dedicated way of getting the header so that the MicroScheduler

View File

@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord;
import java.util.Iterator;
import org.broadinstitute.sting.gatk.Reads;
/**
*
* User: aaron
@ -68,6 +70,14 @@ public class BoundedReadIterator implements StingSAMIterator {
this.doNotUseThatUnmappedReadPile = useThem;
}
/**
 * Retrieves information about reads sources.
 * Pure delegation: this bounded view contributes no source info of its own.
 * @return Info about the sources of reads, as reported by the wrapped iterator.
 */
public Reads getSourceInfo() {
return iterator.getSourceInfo();
}
public SAMFileHeader getHeader() {
// todo: this is bad, we need an iterface out there for samrecords that supports getting the header,

View File

@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord;
import java.util.Iterator;
import java.util.Random;
import org.broadinstitute.sting.gatk.Reads;
public class DownsampleIterator implements StingSAMIterator {
@ -20,6 +22,15 @@ public class DownsampleIterator implements StingSAMIterator {
next = getNextRecord();
}
/**
 * Retrieves information about reads sources.
 * Pure delegation: downsampling wraps another iterator and contributes no
 * source info of its own.
 * @return Info about the sources of reads, as reported by the wrapped iterator.
 */
public Reads getSourceInfo() {
return it.getSourceInfo();
}
public boolean hasNext() {
return next != null;
}

View File

@ -15,7 +15,10 @@ import edu.mit.broad.picard.sam.ReservedTagConstants;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import edu.mit.broad.picard.util.PeekableIterator;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.StingException;
import java.lang.reflect.Constructor;
import java.util.Comparator;
@ -28,7 +31,7 @@ import java.util.PriorityQueue;
* iterable stream. The underlying iterators/files must all have the same sort order unless
* the requested output format is unsorted, in which case any combination is valid.
*/
public class MergingSamRecordIterator2 implements StingSAMIterator {
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord> {
protected PriorityQueue<ComparableSamRecordIterator> pq = null;
protected final SamFileHeaderMerger samHeaderMerger;
protected final SAMFileHeader.SortOrder sortOrder;
@ -272,6 +275,7 @@ public class MergingSamRecordIterator2 implements StingSAMIterator {
// Should replace picard class with the same name
class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements Comparable<ComparableSamRecordIterator>, StingSAMIterator {
private Reads sourceInfo;
private final Comparator<SAMRecord> comparator;
private final SAMFileReader reader;
@ -295,6 +299,12 @@ class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements
this.comparator = comparator;
}
public Reads getSourceInfo() {
if( sourceInfo == null )
throw new StingException("Unable to provide source info for the reads. Please upgrade to the new data sharding framework.");
return sourceInfo;
}
/** Returns the reader from which this iterator was constructed. */
public SAMFileReader getReader() {
return reader;

View File

@ -1,6 +1,8 @@
package org.broadinstitute.sting.gatk.iterators;
import org.broadinstitute.sting.utils.ComparableSAMRecord;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.gatk.Reads;
import net.sf.samtools.SAMRecord;
@ -11,11 +13,11 @@ import java.util.Iterator;
// TODO: Deprecate?
// I don't think we need this if we're only allowing sorted and indexed BAM Files in the GATK - Aaron
public class SortSamIterator implements StingSAMIterator {
private Reads sourceInfo;
private Iterator<ComparableSAMRecord> it;
Iterator<ComparableSAMRecord> it;
public SortSamIterator(Iterator<SAMRecord> unsortedIter, int maxSorts) {
public SortSamIterator(StingSAMIterator unsortedIter, int maxSorts) {
sourceInfo = unsortedIter.getSourceInfo();
ArrayList<ComparableSAMRecord> list = new ArrayList<ComparableSAMRecord>();
while (unsortedIter.hasNext()) {
list.add(new ComparableSAMRecord(unsortedIter.next()));
@ -27,6 +29,16 @@ public class SortSamIterator implements StingSAMIterator {
it = list.iterator();
}
/**
* Retrieves information about reads sources.
* @return Info about the sources of reads.
*/
public Reads getSourceInfo() {
if( sourceInfo == null )
throw new StingException("Unable to provide source info for the reads. Please upgrade to the new data sharding framework.");
return sourceInfo;
}
public boolean hasNext() { return it.hasNext(); }
public SAMRecord next() { return it.next().getRecord(); }

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.gatk.Reads;
/**
*
* User: aaron
@ -28,4 +29,10 @@ import net.sf.samtools.util.CloseableIterator;
* This is the standard interface for all iterators in the Sting package that iterate over SAMRecords
*/
public interface StingSAMIterator extends CloseableIterator<SAMRecord>, Iterable<SAMRecord> {
/**
* Gets source information for the reads. Contains information about the original reads
* files, plus information about downsampling, etc.
* @return
*/
public Reads getSourceInfo();
}

View File

@ -5,6 +5,9 @@ import net.sf.samtools.util.CloseableIterator;
import java.util.Iterator;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.StingException;
/**
*
* User: aaron
@ -31,14 +34,14 @@ import java.util.Iterator;
* <p/>
* This class adapts other SAMRecord iterators to the StingSAMIterator
*/
public class StingSAMIteratorAdapter {
public class StingSAMIteratorAdapter {
public static StingSAMIterator adapt(Iterator<SAMRecord> iter) {
return new PrivateStringSAMIterator(iter);
public static StingSAMIterator adapt(Reads sourceInfo, Iterator<SAMRecord> iter) {
return new PrivateStringSAMIterator(sourceInfo, iter);
}
public static StingSAMIterator adapt(CloseableIterator<SAMRecord> iter) {
return new PrivateStringSAMCloseableIterator(iter);
public static StingSAMIterator adapt(Reads sourceInfo, CloseableIterator<SAMRecord> iter) {
return new PrivateStringSAMCloseableIterator(sourceInfo, iter);
}
}
@ -49,12 +52,20 @@ public class StingSAMIteratorAdapter {
* methods that implement the iterable<> interface and the close() method from CloseableIterator
*/
class PrivateStringSAMIterator implements StingSAMIterator {
private Reads sourceInfo = null;
private Iterator<SAMRecord> iter = null;
PrivateStringSAMIterator(Iterator<SAMRecord> iter) {
PrivateStringSAMIterator(Reads sourceInfo, Iterator<SAMRecord> iter) {
this.sourceInfo = sourceInfo;
this.iter = iter;
}
public Reads getSourceInfo() {
if( sourceInfo == null )
throw new StingException("Unable to provide source info for the reads. Please upgrade to the new data sharding framework.");
return sourceInfo;
}
public void close() {
// do nothing, we can't close the iterator anyway.
}
@ -82,12 +93,20 @@ class PrivateStringSAMIterator implements StingSAMIterator {
* methods that implement the iterable<> interface.
*/
class PrivateStringSAMCloseableIterator implements StingSAMIterator {
private Reads sourceInfo = null;
private CloseableIterator<SAMRecord> iter = null;
PrivateStringSAMCloseableIterator(CloseableIterator<SAMRecord> iter) {
PrivateStringSAMCloseableIterator(Reads sourceInfo, CloseableIterator<SAMRecord> iter) {
this.sourceInfo = sourceInfo;
this.iter = iter;
}
public Reads getSourceInfo() {
if( sourceInfo == null )
throw new StingException("Unable to provide source info for the reads. Please upgrade to the new data sharding framework.");
return sourceInfo;
}
public void close() {
iter.close();
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.RuntimeIOException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.Reads;
import java.util.Iterator;
@ -23,6 +24,15 @@ public class VerifyingSamIterator implements StingSAMIterator {
this.it = it;
}
/**
* Retrieves information about reads sources.
* @return Info about the sources of reads.
*/
public Reads getSourceInfo() {
return it.getSourceInfo();
}
public boolean hasNext() { return this.it.hasNext(); }
public SAMRecord next() {

View File

@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
@ -131,11 +132,11 @@ public abstract class TraversalEngine {
this.walkOverAllSites = walkOverAllSites;
}
public void setDebugging(final boolean d) {
private void setDebugging(final boolean d) {
DEBUGGING = d;
}
public void setSafetyChecking(final boolean beSafeP) {
private void setSafetyChecking(final boolean beSafeP) {
if (!beSafeP)
logger.warn("*** Turning off safety checking, I hope you know what you are doing. Errors will result in debugging assert failures and other inscrutable messages...");
this.beSafeP = beSafeP;
@ -147,26 +148,39 @@ public abstract class TraversalEngine {
this.FILTER_UNSORTED_READS = filterUnsorted;
}
public void setSortOnFly(final int maxReadsToSort) {
private void setSortOnFly(final int maxReadsToSort) {
logger.info("Sorting read file on the fly: max reads allowed is " + maxReadsToSort);
SORT_ON_FLY = true;
maxOnFlySorts = maxReadsToSort;
}
public void setSortOnFly() { setSortOnFly(100000); }
private void setSortOnFly() { setSortOnFly(100000); }
public void setDownsampleByFraction(final double fraction) {
private void setDownsampleByFraction(final double fraction) {
logger.info("Downsampling to approximately " + (fraction * 100.0) + "% of filtered reads");
DOWNSAMPLE_BY_FRACTION = true;
downsamplingFraction = fraction;
}
public void setDownsampleByCoverage(final int coverage) {
private void setDownsampleByCoverage(final int coverage) {
logger.info("Downsampling to coverage " + coverage);
DOWNSAMPLE_BY_COVERAGE = true;
downsamplingCoverage = coverage;
}
/**
* Sets up read filtering performed by the traversal engine.
* @param reads Read filters to instantiate as the traversal engine walks over the data.
* @deprecated Should only be used by old-style traversals.
*/
@Deprecated
public void setReadFilters( Reads reads) {
if( reads.getDownsamplingFraction() != null ) setDownsampleByFraction(reads.getDownsamplingFraction());
if( reads.getDownsampleToCoverage() != null ) setDownsampleByCoverage(reads.getDownsampleToCoverage());
if( reads.getMaxOnTheFlySorts() != null ) setSortOnFly(reads.getMaxOnTheFlySorts());
if( reads.getSafetyChecking() != null ) setSafetyChecking(reads.getSafetyChecking());
}
// --------------------------------------------------------------------------------------------------------------
//
// functions for dealing locations (areas of the genome we're traversing over)
@ -318,36 +332,9 @@ public abstract class TraversalEngine {
return true;
}
/**
* Unpack the files to be processed, given a list of files. That list of files can
* itself contain lists of other files to be read.
* @param inputFiles
* @return
*/
public static List<File> unpackReads( List<File> inputFiles ) throws FileNotFoundException {
List<File> unpackedReads = new ArrayList<File>();
for( File inputFile: inputFiles ) {
if( inputFile.getName().endsWith(".list") ) {
for( String fileName : new xReadLines(inputFile) )
unpackedReads.add( new File(fileName) );
}
else
unpackedReads.add( inputFile );
}
return unpackedReads;
}
protected Iterator<SAMRecord> initializeReads() {
List<File> allReadsFiles = null;
try {
allReadsFiles = unpackReads( readsFiles );
}
catch( FileNotFoundException ex ) {
throw new RuntimeException("Unable to unpack reads", ex );
}
if( allReadsFiles.size() == 1 )
samReader = initializeSAMFile(allReadsFiles.get(0));
if( readsFiles.size() == 1 )
samReader = initializeSAMFile(readsFiles.get(0));
else
samReader = null;
return wrapReadsIterator(getReadsIterator(samReader), true);
@ -358,16 +345,9 @@ public abstract class TraversalEngine {
if ( samReader == null && readsFiles.size() > 0 ) {
SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
List<File> allReadsFiles = null;
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
try {
allReadsFiles = unpackReads( readsFiles );
}
catch(FileNotFoundException ex) {
throw new RuntimeException("Unable to unpack reads", ex);
}
for ( File readsFile: allReadsFiles ) {
for ( File readsFile: readsFiles ) {
SAMFileReader reader = initializeSAMFile(readsFile);
readers.add(reader);
}
@ -382,28 +362,52 @@ public abstract class TraversalEngine {
@Deprecated
protected StingSAMIterator wrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
StingSAMIterator wrappedIterator = StingSAMIteratorAdapter.adapt(rawIterator);
wrappedIterator = applyDecoratingIterators(enableVerification, wrappedIterator);
// Reads sourceInfo is gone by this point in the traversal engine. Stub in a null and rely on the iterator to
// throw an exception if reads info isn't present.
StingSAMIterator wrappedIterator = StingSAMIteratorAdapter.adapt(null,rawIterator);
wrappedIterator = applyDecoratingIterators(enableVerification,wrappedIterator);
if (THREADED_IO) {
logger.info(String.format("Enabling threaded I/O with buffer of %d reads", THREADED_IO_BUFFER_SIZE));
wrappedIterator = StingSAMIteratorAdapter.adapt(new ThreadedIterator<SAMRecord>(wrappedIterator, THREADED_IO_BUFFER_SIZE));
wrappedIterator = StingSAMIteratorAdapter.adapt(null,new ThreadedIterator<SAMRecord>(wrappedIterator, THREADED_IO_BUFFER_SIZE));
}
return wrappedIterator;
}
protected StingSAMIterator applyDecoratingIterators(boolean enableVerification, StingSAMIterator wrappedIterator) {
/**
* Repackage instance variables and call static method.
* TODO: This method's days are numbered.
* @param enableVerification
* @param wrappedIterator
* @return
*/
protected StingSAMIterator applyDecoratingIterators( final boolean enableVerification, final StingSAMIterator wrappedIterator ) {
return applyDecoratingIterators(enableVerification,
wrappedIterator,
DOWNSAMPLE_BY_FRACTION ? downsamplingFraction : null,
SORT_ON_FLY ? maxOnFlySorts : null,
beSafeP);
}
/**
* WARNING: In TraversalEngine for backward compatibility ONLY. Reads are not used as the data source, only as parameters
* for validation.
*/
public static StingSAMIterator applyDecoratingIterators(boolean enableVerification,
StingSAMIterator wrappedIterator,
Double downsamplingFraction,
Integer maxOnFlySorts,
Boolean beSafeP) {
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end of throwing away
if (DOWNSAMPLE_BY_FRACTION)
if (downsamplingFraction != null)
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
if (SORT_ON_FLY)
if (maxOnFlySorts != null)
wrappedIterator = new SortSamIterator(wrappedIterator, maxOnFlySorts);
if (beSafeP && enableVerification)
if (beSafeP != null && beSafeP && enableVerification)
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
return wrappedIterator;
}

View File

@ -2,11 +2,15 @@ package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceLocusIterator;
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.dataSources.providers.SeekableLocusContextQueue;
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextQueue;
import org.broadinstitute.sting.gatk.dataSources.providers.IterableLocusContextQueue;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -33,7 +37,6 @@ public class TraverseLociByReference extends TraversalEngine {
*/
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
public TraverseLociByReference(List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
super( reads, ref, rods );
}
@ -57,8 +60,23 @@ public class TraverseLociByReference extends TraversalEngine {
LocusWalker<M, T> locusWalker = (LocusWalker<M, T>)walker;
LocusIterator locusIterator = new ReferenceLocusIterator( dataProvider );
SeekableLocusContextQueue locusContextQueue = new SeekableLocusContextQueue( dataProvider );
LocusIterator locusIterator = null;
LocusContextQueue locusContextQueue = null;
DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
switch( dataSource ) {
case REFERENCE:
locusIterator = new ReferenceLocusIterator( dataProvider );
locusContextQueue = new SeekableLocusContextQueue( dataProvider );
break;
case READS:
IterableLocusContextQueue iterableQueue = new IterableLocusContextQueue( dataProvider );
locusIterator = iterableQueue;
locusContextQueue = iterableQueue;
break;
default:
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
}
// We keep processing while the next reference location is within the interval
while( locusIterator.hasNext() ) {
@ -72,9 +90,6 @@ public class TraverseLociByReference extends TraversalEngine {
LocusContext locus = locusContextQueue.seek( site ).peek();
char refBase = dataProvider.getReferenceBase( site );
if ( DOWNSAMPLE_BY_COVERAGE )
locus.downsampleToCoverage(downsamplingCoverage);
final boolean keepMeP = locusWalker.filter(tracker, refBase, locus);
if (keepMeP) {
M x = locusWalker.map(tracker, refBase, locus);

View File

@ -6,7 +6,6 @@ import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
@ -88,11 +87,8 @@ public class TraverseReads extends TraversalEngine {
ReadWalker<M, T> readWalker = (ReadWalker<M, T>) walker;
// we allow a bunch of wrapping iterators for down sampling, threadingIO, etc.
StingSAMIterator it = applyDecoratingIterators(true, dataProvider.getReadIterator());
// while we still have more reads
for (SAMRecord read : it) {
for (SAMRecord read : dataProvider.getReadIterator()) {
// our locus context
LocusContext locus = null;

View File

@ -0,0 +1,33 @@
package org.broadinstitute.sting.gatk.walkers;
import java.lang.annotation.Documented;
import java.lang.annotation.Inherited;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
import java.lang.annotation.ElementType;
/**
* User: hanna
* Date: May 14, 2009
* Time: 1:51:22 PM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* Allows the walker to indicate what type of data it wants to consume.
*/
@Documented
@Inherited
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface By {
DataSource value();
}

View File

@ -0,0 +1,21 @@
package org.broadinstitute.sting.gatk.walkers;
/**
* User: hanna
* Date: May 14, 2009
* Time: 2:12:33 PM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* Allow user to choose between a number of different data sources.
*/
public enum DataSource {
READS,
REFERENCE
}

View File

@ -1,11 +1,8 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
@ -13,6 +10,7 @@ import java.util.List;
* Time: 2:52:28 PM
* To change this template use File | Settings | File Templates.
*/
@By(DataSource.READS)
public abstract class LocusWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {

View File

@ -17,6 +17,7 @@ import net.sf.samtools.SAMRecord;
* Time: 11:23:14 AM
* To change this template use File | Settings | File Templates.
*/
@By(DataSource.REFERENCE)
public class PrintLocusContextWalker extends LocusWalker<LocusContext, Integer> implements TreeReducible<Integer> {
public LocusContext map(RefMetaDataTracker tracker, char ref, LocusContext context) {
out.printf( "In map: ref = %c, loc = %s, reads = %s%n", ref,

View File

@ -82,9 +82,9 @@ public class GATKArgumentCollectionTest extends BaseTest {
collect.HAPMAPChipFile = "HAPMAPChipFile".toLowerCase();
collect.enabledThreadedIO = true;
collect.unsafe = false;
collect.maximumReadSorts = "maximumReadSorts".toLowerCase();
collect.downsampleFraction = "downsampleFraction".toLowerCase();
collect.downsampleCoverage = "downsampleCoverage".toLowerCase();
collect.maximumReadSorts = null;
collect.downsampleFraction = null;
collect.downsampleCoverage = null;
collect.intervals = "intervals".toLowerCase();
collect.walkAllLoci = true;
collect.disableThreading = false;

View File

@ -84,7 +84,8 @@ public class IterableLocusContextQueueTest extends LocusContextQueueTemplate {
if(new GenomeLoc(read).containsP(locusContext.getLocation()))
Assert.assertTrue("Target locus context does not contain reads", locusContext.getReads().contains(read) );
}
}
Assert.assertFalse("Iterator is not bounded at boundaries of shard", iterableQueue.hasNext());
}
}

View File

@ -7,8 +7,10 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.shards.LocusShard;
import org.broadinstitute.sting.gatk.Reads;
import java.io.FileNotFoundException;
import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Iterator;
@ -294,6 +296,11 @@ public abstract class LocusContextQueueTemplate extends BaseTest {
backingIterator = backingList.iterator();
}
public Reads getSourceInfo() {
// There are no sources for these reads.
return new Reads(new ArrayList<File>());
}
public boolean hasNext() {
return backingIterator.hasNext();
}

View File

@ -7,6 +7,8 @@ import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.junit.After;
@ -45,7 +47,7 @@ import java.util.List;
*/
public class SAMBAMDataSourceTest extends BaseTest {
private List<String> fl;
private List<File> fl;
private FastaSequenceFile2 seq;
/**
@ -55,7 +57,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
*/
@Before
public void doForEachTest() {
fl = new ArrayList<String>();
fl = new ArrayList<File>();
// sequence
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
@ -83,18 +85,18 @@ public class SAMBAMDataSourceTest extends BaseTest {
int count = 0;
// setup the data
fl.add(oneKGLocation + "/pilot3/sams/NA12892.bam");
fl.add(new File(oneKGLocation + "/pilot3/sams/NA12892.bam"));
Reads reads = new Reads(fl);
try {
SAMDataSource data = new SAMDataSource(fl);
SAMDataSource data = new SAMDataSource(reads);
for (Shard sh : strat) {
int readCount = 0;
count++;
logger.debug("Start : " + sh.getGenomeLoc().getStart() + " stop : " + sh.getGenomeLoc().getStop() + " contig " + sh.getGenomeLoc().getContig());
logger.debug("count = " + count);
MergingSamRecordIterator2 datum = (MergingSamRecordIterator2)data.seek(sh);
StingSAMIterator datum = data.seek(sh);
// for the first couple of shards make sure we can see the reads
if (count < 5) {
@ -126,7 +128,8 @@ public class SAMBAMDataSourceTest extends BaseTest {
// setup the test files
fl.add(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam");
fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam"));
Reads reads = new Reads(fl);
ArrayList<Integer> readcountPerShard = new ArrayList<Integer>();
ArrayList<Integer> readcountPerShard2 = new ArrayList<Integer>();
@ -136,7 +139,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
int count = 0;
try {
SAMDataSource data = new SAMDataSource(fl);
SAMDataSource data = new SAMDataSource(reads);
for (Shard sh : strat) {
int readCount = 0;
count++;
@ -144,7 +147,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
break;
}
MergingSamRecordIterator2 datum = (MergingSamRecordIterator2)data.seek(sh);
StingSAMIterator datum = data.seek(sh);
for (SAMRecord r : datum) {
readCount++;
@ -163,15 +166,17 @@ public class SAMBAMDataSourceTest extends BaseTest {
// setup the data and the counter before our second run
fl.clear();
fl.add(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188-01A-01W.aligned.duplicates_marked.bam");
fl.add(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188-10B-01W.aligned.duplicates_marked.bam");
fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188-01A-01W.aligned.duplicates_marked.bam"));
fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188-10B-01W.aligned.duplicates_marked.bam"));
reads = new Reads(fl);
count = 0;
// the sharding strat.
strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
logger.debug("Pile two:");
try {
SAMDataSource data = new SAMDataSource(fl);
SAMDataSource data = new SAMDataSource(reads);
for (Shard sh : strat) {
int readCount = 0;
count++;
@ -181,7 +186,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
break;
}
MergingSamRecordIterator2 datum = (MergingSamRecordIterator2)data.seek(sh);
StingSAMIterator datum = data.seek(sh);
for (SAMRecord r : datum) {
readCount++;

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import static org.junit.Assert.assertEquals;
@ -46,7 +47,7 @@ import java.util.List;
public class SAMByReadsTest extends BaseTest {
private FastaSequenceFile2 seq;
private List<String> fl;
private List<File> fl;
/**
* This function does the setup of our parser, before each method call.
@ -55,7 +56,7 @@ public class SAMByReadsTest extends BaseTest {
*/
@Before
public void doForEachTest() {
fl = new ArrayList<String>();
fl = new ArrayList<File>();
// sequence
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.fasta"));
@ -69,13 +70,15 @@ public class SAMByReadsTest extends BaseTest {
logger.warn("Executing testTotalReadCount");
// setup the test files
fl.add("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam");
fl.add(new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"));
Reads reads = new Reads(fl);
final int targetReadCount = 5000;
ShardStrategy shardStrategy = ShardStrategyFactory.shatterByReadCount(seq.getSequenceDictionary(),targetReadCount);
try {
SAMDataSource data = new SAMDataSource(fl);
SAMDataSource data = new SAMDataSource(reads);
// check the total read count
final int totalReads = 10000;

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import static org.junit.Assert.assertEquals;
@ -48,7 +49,7 @@ import java.util.List;
public class BoundedReadIteratorTest extends BaseTest {
/** the file list and the fasta sequence */
private List<String> fl;
private List<File> fl;
private FastaSequenceFile2 seq;
/**
@ -58,7 +59,7 @@ public class BoundedReadIteratorTest extends BaseTest {
*/
@Before
public void doForEachTest() {
fl = new ArrayList<String>();
fl = new ArrayList<File>();
// sequence
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
@ -76,14 +77,15 @@ public class BoundedReadIteratorTest extends BaseTest {
// setup the test files
fl.add(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam");
fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam"));
Reads reads = new Reads(fl);
// our target read
final long boundedReadCount = 100;
long shardReadCount = 0;
try {
SAMDataSource data = new SAMDataSource(fl);
SAMDataSource data = new SAMDataSource(reads);
// make sure we have a shard
if (!strat.hasNext()) {
@ -92,8 +94,8 @@ public class BoundedReadIteratorTest extends BaseTest {
Shard sd = strat.next();
MergingSamRecordIterator2 datum = (MergingSamRecordIterator2)data.seek(sd);
MergingSamRecordIterator2 datum2 = (MergingSamRecordIterator2)data.seek(sd);
StingSAMIterator datum = data.seek(sd);
StingSAMIterator datum2 = data.seek(sd);
// check the reads in the shard
for (SAMRecord r : datum) {
@ -102,7 +104,7 @@ public class BoundedReadIteratorTest extends BaseTest {
}
// create the bounded iterator
BoundedReadIterator iter = new BoundedReadIterator(datum2, boundedReadCount);
BoundedReadIterator iter = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,datum2), boundedReadCount);
// now see how many reads are in the bounded iterator
int readCount = 0;

View File

@ -97,7 +97,7 @@ public class StingSAMIteratorAdapterTest extends BaseTest {
final int COUNT = 100;
MyTestIterator it = new MyTestIterator();
StingSAMIterator samIt = StingSAMIteratorAdapter.adapt(it);
StingSAMIterator samIt = StingSAMIteratorAdapter.adapt(null,it);
int countCheck = 0;
while (samIt.hasNext()) {
samIt.next();
@ -116,7 +116,7 @@ public class StingSAMIteratorAdapterTest extends BaseTest {
MyTestCloseableIterator it = new MyTestCloseableIterator();
StingSAMIterator samIt = StingSAMIteratorAdapter.adapt(it);
StingSAMIterator samIt = StingSAMIteratorAdapter.adapt(null,it);
int countCheck = 0;
while (samIt.hasNext()) {
@ -133,7 +133,7 @@ public class StingSAMIteratorAdapterTest extends BaseTest {
MyTestCloseableIterator it = new MyTestCloseableIterator();
StingSAMIterator samIt = StingSAMIteratorAdapter.adapt(it);
StingSAMIterator samIt = StingSAMIteratorAdapter.adapt(null,it);
int countCheck = 0;

View File

@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.CountReadsWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
@ -121,15 +122,7 @@ public class TraverseReadsTest extends BaseTest {
ref.getSequenceDictionary(),
readSize);
List<File> unpackedReads = null;
try {
unpackedReads = TraversalEngine.unpackReads(bamList);
}
catch (FileNotFoundException ex) {
throw new RuntimeException(ex);
}
SAMDataSource dataSource = new SAMDataSource(unpackedReads);
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
dataSource.viewUnmappedReads(false);
countReadWalker.initialize();
@ -176,15 +169,8 @@ public class TraverseReadsTest extends BaseTest {
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,
ref.getSequenceDictionary(),
readSize);
List<File> unpackedReads = null;
try {
unpackedReads = TraversalEngine.unpackReads(bamList);
}
catch (FileNotFoundException ex) {
throw new RuntimeException(ex);
}
SAMDataSource dataSource = new SAMDataSource(unpackedReads);
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
dataSource.viewUnmappedReads(true);
countReadWalker.initialize();