1. Added a by-interval traversal.
2. Added a shell for the indel cleaner walker (it is currently being used to test the interval traversal).
3. Fixed a small bug in downsampling (make sure to downsample the offsets too).
4. GenomeAnalysisTK.execute — does anyone object to my change to "instanceof" instead of trying to catch a ClassCastException (yuck)?

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@524 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1984bb2d13
commit
13d4692d2e
|
|
@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|||
import org.broadinstitute.sting.gatk.traversals.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.IntervalWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -247,8 +248,8 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
|
||||
MicroManager microManager = null;
|
||||
|
||||
// Try to get the walker specified
|
||||
try {
|
||||
// Get the walker specified
|
||||
if ( my_walker instanceof LocusWalker ) {
|
||||
LocusWalker<?, ?> walker = (LocusWalker<?, ?>) my_walker;
|
||||
|
||||
if ( REF_FILE_ARG == null )
|
||||
|
|
@ -277,10 +278,10 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
else
|
||||
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
}
|
||||
}
|
||||
catch (java.lang.ClassCastException e) {
|
||||
// I guess we're a read walker LOL
|
||||
ReadWalker<?, ?> walker = (ReadWalker<?, ?>) my_walker;
|
||||
} else if ( my_walker instanceof IntervalWalker ) {
|
||||
this.engine = new TraverseByIntervals(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
} else {
|
||||
// we're a read walker
|
||||
this.engine = new TraverseByReads(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -120,9 +120,11 @@ public class LocusContext {
|
|||
i++;
|
||||
}
|
||||
|
||||
ArrayList downsampledReads = new ArrayList();
|
||||
ArrayList<SAMRecord> downsampledReads = new ArrayList<SAMRecord>();
|
||||
ArrayList<Integer> downsampledOffsets = new ArrayList<Integer>();
|
||||
Iterator positionIter = positions.iterator();
|
||||
Iterator readsIter = reads.iterator();
|
||||
Iterator<SAMRecord> readsIter = reads.iterator();
|
||||
Iterator<Integer> offsetsIter = offsets.iterator();
|
||||
int currentRead = 0;
|
||||
while ( positionIter.hasNext() ) {
|
||||
int nextReadToKeep = (Integer)positionIter.next();
|
||||
|
|
@ -130,13 +132,16 @@ public class LocusContext {
|
|||
// fast-forward to the right read
|
||||
while ( currentRead < nextReadToKeep ) {
|
||||
readsIter.next();
|
||||
offsetsIter.next();
|
||||
currentRead++;
|
||||
}
|
||||
|
||||
downsampledReads.add(readsIter.next());
|
||||
downsampledOffsets.add(offsetsIter.next());
|
||||
currentRead++;
|
||||
}
|
||||
|
||||
reads = downsampledReads;
|
||||
offsets = downsampledOffsets;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -496,8 +496,7 @@ public abstract class TraversalEngine {
|
|||
|
||||
/**
|
||||
* Class to filter out un-handle-able reads from the stream. We currently are skipping
|
||||
* unmapped reads, non-primary reads, unaligned reads, and those with indels. We should
|
||||
* really change this to handle indel containing reads.
|
||||
* unmapped reads, non-primary reads, unaligned reads, and duplicate reads.
|
||||
*/
|
||||
public static class locusStreamFilterFunc implements SamRecordFilter {
|
||||
SAMRecord lastRead = null;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,156 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;

import org.broadinstitute.sting.gatk.walkers.IntervalWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;

import java.util.List;
import java.util.Iterator;
import java.util.ArrayList;
import java.io.File;

import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import edu.mit.broad.picard.filter.FilteringIterator;

/**
 * Traversal engine that visits each user-supplied genomic interval in order,
 * gathering all reads overlapping the interval plus the reference bases and
 * reference-ordered data (RODs) spanning it, and handing each interval to an
 * {@link IntervalWalker} via its filter/map/reduce contract.
 *
 * Requires an indexed SAM/BAM input, since intervals are fetched with random
 * access queries ({@code queryOverlapping}).
 *
 * User: ebanks
 * Date: Apr 23, 2009
 */
public class TraverseByIntervals extends TraversalEngine {

    /**
     * @param reads indexed SAM/BAM file of reads to traverse
     * @param ref   reference FASTA file
     * @param rods  reference-ordered data tracks to make available at each interval
     */
    public TraverseByIntervals(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
        super(reads, ref, rods);
    }

    /**
     * Generic traversal entry point. Only {@link IntervalWalker}s are supported
     * by this engine.
     *
     * @param walker    the walker to drive; must be an IntervalWalker
     * @param locations the intervals to visit, in order
     * @return the final reduce result of the walker
     * @throws IllegalArgumentException if the walker is not an IntervalWalker
     */
    public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
        if ( walker instanceof IntervalWalker ) {
            IntervalWalker<M, T> intervalWalker = (IntervalWalker<M, T>)walker;
            T sum = traverseByIntervals(intervalWalker, locations);
            return sum;
        } else {
            throw new IllegalArgumentException("Walker isn't an interval walker!");
        }
    }

    /**
     * Traverse by intervals -- the key driver of linearly ordered traversal of intervals.
     * Provides reads, RODs, and the reference bases for each interval to the walker,
     * supporting the interaction contract implied by the interval walker.
     *
     * @param walker    an interval walker object
     * @param locations the intervals to visit, in order
     * @param <M>       MapType -- the result of calling map() on walker
     * @param <T>       ReduceType -- the result of calling reduce() on the walker
     * @return the final reduce result (also passed to walker.onTraversalDone)
     */
    protected <M, T> T traverseByIntervals(IntervalWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
        logger.debug("Entering traverseByIntervals");

        samReader = initializeSAMFile(readsFile);

        verifySortOrder(true);

        walker.initialize();

        T sum = walker.reduceInit();

        if ( locations.isEmpty() ) {
            logger.debug("There are no intervals provided for the traversal");
        } else {
            // Random-access interval queries without an index would degrade to
            // a full linear scan per interval; refuse rather than crawl.
            if ( ! samReader.hasIndex() )
                Utils.scareUser("Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting.");

            for ( GenomeLoc interval : locations ) {
                logger.debug(String.format("Processing interval %s", interval.toString()));

                CloseableIterator<SAMRecord> readIter = samReader.queryOverlapping( interval.getContig(),
                                                                                   (int)interval.getStart(),
                                                                                   (int)interval.getStop());

                Iterator<SAMRecord> wrappedIter = WrapReadsIterator( readIter, false );
                sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval);
                readIter.close();
            }
        }

        printOnTraversalDone("intervals", sum);
        walker.onTraversalDone(sum);
        return sum;
    }

    /**
     * Collects the (filtered, optionally downsampled) reads and the reference/ROD
     * context for a single interval, then invokes the walker on it.
     *
     * @param walker   the interval walker
     * @param readIter iterator over reads overlapping the interval
     * @param sum      the running reduce value
     * @param interval the genomic interval being processed
     * @return the updated reduce value
     */
    protected <M, T> T carryWalkerOverInterval(IntervalWalker<M, T> walker, Iterator<SAMRecord> readIter, T sum, GenomeLoc interval ) {
        logger.debug(String.format("TraverseByIntervals.carryWalkerOverInterval Genomic interval is %s", interval));

        // prepare the read filtering read iterator and provide it to a new interval iterator
        FilteringIterator filterIter = new FilteringIterator(readIter, new locusStreamFilterFunc());

        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();
        boolean done = false;
        while (filterIter.hasNext() && !done) {
            TraversalStatistics.nRecords++;
            SAMRecord read = filterIter.next();
            reads.add(read);
            // NOTE(review): offset is the read's alignment start relative to the
            // interval start (may be negative for reads starting before the
            // interval) -- confirm this is what LocusContext consumers expect.
            offsets.add((int)(read.getAlignmentStart() - interval.getStart()));
            if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
                // Fixed: the original passed a concatenated string through
                // String.format with no format specifier, which would throw on
                // any '%' in the message.
                logger.warn(String.format("Maximum number of reads encountered, terminating traversal %d", TraversalStatistics.nRecords));
                done = true;
            }
        }

        LocusContext locus = new LocusContext(interval, reads, offsets);
        if ( DOWNSAMPLE_BY_COVERAGE )
            locus.downsampleToCoverage(downsamplingCoverage);

        ReferenceIterator refSite = refIter.seekForward(locus.getLocation());
        locus.setReferenceContig(refSite.getCurrentContig());

        // Iterate forward to get all reference ordered data covering this interval
        final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus(locus.getLocation());

        sum = walkAtinterval( walker, sum, locus, refSite, tracker );

        //System.out.format("Working at %s\n", locus.getLocation().toString());

        printProgress("intervals", locus.getLocation());

        return sum;
    }

    /**
     * Executes the walker contract (filter, map, reduce) for one interval.
     * Builds the reference base string for the full inclusive span
     * [start, stop] by advancing the reference iterator.
     *
     * @param walker  the interval walker
     * @param sum     the running reduce value
     * @param locus   the interval context (reads + offsets)
     * @param refSite reference iterator positioned at the interval start
     * @param tracker reference-ordered data at this interval
     * @return the updated reduce value
     */
    protected <M, T> T walkAtinterval( final IntervalWalker<M, T> walker,
                                       T sum,
                                       final LocusContext locus,
                                       final ReferenceIterator refSite,
                                       final RefMetaDataTracker tracker ) {
        ReferenceIterator refSiteCopy = refSite;
        // StringBuilder: local, single-threaded; no need for StringBuffer's locking.
        StringBuilder refBases = new StringBuilder(refSiteCopy.getBaseAsString());
        // stop - start more bases after the first gives stop - start + 1 total:
        // the inclusive span of the interval.
        int locusLength = (int)(locus.getLocation().getStop() - locus.getLocation().getStart());
        for ( int i = 0; i < locusLength; i++ ) {
            refSiteCopy = refSiteCopy.next();
            refBases.append(refSiteCopy.getBaseAsChar());
        }

        //logger.debug(String.format("  Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase));

        //
        // Execute our contract with the walker.  Call filter, map, and reduce
        //
        final boolean keepMeP = walker.filter(tracker, refBases.toString(), locus);
        if (keepMeP) {
            M x = walker.map(tracker, refBases.toString(), locus);
            sum = walker.reduce(x, sum);
        }

        //printProgress("intervals", interval.getLocation());
        return sum;
    }
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;

import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;

/**
 * Base class for walkers driven once per genomic interval (see
 * TraverseByIntervals). For each interval the traversal engine calls
 * filter(), then map() if filter() returned true, then folds the map
 * result into the running reduce value via reduce().
 *
 * @param <MapType>    result type of map() for a single interval
 * @param <ReduceType> accumulator type threaded through reduce()
 *
 * User: ebanks
 * Date: Apr 23, 2009
 */
public abstract class IntervalWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
    // Do we actually want to operate on the context? Default: yes -- keep
    // every interval; subclasses override to skip intervals.
    public boolean filter(RefMetaDataTracker tracker, String ref, LocusContext context) {
        return true;    // We are keeping all the reads
    }

    // Map over the org.broadinstitute.sting.gatk.LocusContext: one call per
    // interval, with the RODs, reference bases, and reads for that interval.
    public abstract MapType map(RefMetaDataTracker tracker, String ref, LocusContext context);

    // Initial reduce value, combined with each map() result in turn.
    public abstract ReduceType reduceInit();
    public abstract ReduceType reduce(MapType value, ReduceType sum);
}
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.IntervalWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
|
||||
@WalkerName("IntervalCleaner")
|
||||
public class IntervalCleanerWalker extends IntervalWalker<Integer, Integer> {
|
||||
@Argument(fullName="maxReadLength", shortName="maxRead", required=false, defaultValue="-1")
|
||||
public int maxReadLength;
|
||||
|
||||
public void initialize() {}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, String ref, LocusContext context) {
|
||||
out.println(context.getLocation() + " (" + context.getReads().size() + ") => " + ref);
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
out.println("Saw " + result + " intervals");
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue