Framework for ROD walkers -- totally experiment and not working right now
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1600 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bd75a8d168
commit
6e13a36059
|
|
@ -507,11 +507,14 @@ public class GenomeAnalysisEngine {
|
|||
ReferenceSequenceFile drivingDataSource,
|
||||
GenomeLocSortedSet intervals,
|
||||
Integer maxIterations) {
|
||||
final long SHARD_SIZE = 100000L;
|
||||
long SHARD_SIZE = 100000L;
|
||||
|
||||
ShardStrategy shardStrategy = null;
|
||||
ShardStrategyFactory.SHATTER_STRATEGY shardType;
|
||||
|
||||
if (walker instanceof LocusWalker) {
|
||||
if ( walker instanceof RodWalker ) SHARD_SIZE *= 100;
|
||||
|
||||
if (intervals != null) {
|
||||
shardType = (walker.isReduceByInterval()) ?
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
|
||||
|
|
@ -525,7 +528,6 @@ public class GenomeAnalysisEngine {
|
|||
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE, maxIterations);
|
||||
|
||||
} else if (walker instanceof ReadWalker ||
|
||||
walker instanceof DuplicateWalker) {
|
||||
|
||||
|
|
|
|||
|
|
@ -22,19 +22,6 @@ import org.apache.log4j.Logger;
|
|||
*/
|
||||
|
||||
public class CoveredLocusView extends LocusView {
|
||||
/**
|
||||
* Gets the position to which the last seek was requested.
|
||||
*/
|
||||
private GenomeLoc seekPoint;
|
||||
|
||||
/**
|
||||
* What's the context for the last locus accessed?
|
||||
* @param provider
|
||||
*/
|
||||
private AlignmentContext nextLocusContext = null;
|
||||
|
||||
private static Logger logger = Logger.getLogger(CoveredLocusView.class);
|
||||
|
||||
/**
|
||||
* Create a new queue of locus contexts.
|
||||
* @param provider
|
||||
|
|
|
|||
|
|
@ -25,11 +25,11 @@ import java.util.Collections;
|
|||
/**
|
||||
* A view into the reference-ordered data in the provider.
|
||||
*/
|
||||
public class ReferenceOrderedView implements View {
|
||||
public class ManagingReferenceOrderedView implements ReferenceOrderedView {
|
||||
/**
|
||||
* The provider that's supplying our backing data.
|
||||
*/
|
||||
private final ShardDataProvider provider;
|
||||
//private final ShardDataProvider provider;
|
||||
|
||||
/**
|
||||
* The data sources along with their current states.
|
||||
|
|
@ -40,8 +40,8 @@ public class ReferenceOrderedView implements View {
|
|||
* Create a new view of reference-ordered data.
|
||||
* @param provider
|
||||
*/
|
||||
public ReferenceOrderedView( ShardDataProvider provider ) {
|
||||
this.provider = provider;
|
||||
public ManagingReferenceOrderedView( ShardDataProvider provider ) {
|
||||
//this.provider = provider;
|
||||
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() )
|
||||
states.add( new ReferenceOrderedDataState( dataSource, (RODIterator)dataSource.seek(provider.getShard()) ) );
|
||||
|
||||
|
|
@ -0,0 +1,220 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MergingIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
/**
|
||||
* User: hanna
|
||||
* Date: May 21, 2009
|
||||
* Time: 2:49:17 PM
|
||||
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
|
||||
* Software and documentation are copyright 2005 by the Broad Institute.
|
||||
* All rights are reserved.
|
||||
*
|
||||
* Users acknowledge that this software is supplied without any warranty or support.
|
||||
* The Broad Institute is not responsible for its use, misuse, or
|
||||
* functionality.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A view into the reference-ordered data in the provider.
|
||||
*/
|
||||
public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
||||
/**
|
||||
* The provider that's supplying our backing data.
|
||||
*/
|
||||
//private final ShardDataProvider provider;
|
||||
|
||||
/**
|
||||
* The data sources along with their current states.
|
||||
*/
|
||||
private MergingIterator<ReferenceOrderedDatum> rodQueue = null;
|
||||
|
||||
RefMetaDataTracker tracker = null;
|
||||
GenomeLoc lastLoc = null;
|
||||
ReferenceOrderedDatum interval = null;
|
||||
|
||||
// broken support for multi-locus rods
|
||||
//List<ReferenceOrderedDatum> multiLocusRODs = new LinkedList<ReferenceOrderedDatum>();
|
||||
|
||||
/**
|
||||
* Enable debugging output -- todo remove me
|
||||
*/
|
||||
final static boolean DEBUG = false;
|
||||
|
||||
final static String INTERVAL_ROD_NAME = "interval";
|
||||
|
||||
/**
|
||||
* Create a new view of reference-ordered data.
|
||||
*
|
||||
* @param provider
|
||||
*/
|
||||
public RodLocusView( ShardDataProvider provider ) {
|
||||
super(provider);
|
||||
|
||||
GenomeLoc loc = provider.getShard().getGenomeLoc();
|
||||
|
||||
List<Iterator<ReferenceOrderedDatum>> iterators = new LinkedList<Iterator<ReferenceOrderedDatum>>();
|
||||
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) {
|
||||
if ( DEBUG ) System.out.printf("Shard is %s%n", loc);
|
||||
RODIterator it = (RODIterator)dataSource.seek(provider.getShard());
|
||||
ReferenceOrderedDatum x = it.seekForward(loc);
|
||||
|
||||
// we need to special case the interval so we don't always think there's a rod at the first location
|
||||
if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) {
|
||||
if ( interval != null )
|
||||
throw new RuntimeException("BUG: interval local variable already assigned " + interval);
|
||||
interval = x;
|
||||
} else {
|
||||
iterators.add( (Iterator<ReferenceOrderedDatum>)it );
|
||||
}
|
||||
}
|
||||
|
||||
rodQueue = new MergingIterator<ReferenceOrderedDatum>(iterators);
|
||||
}
|
||||
|
||||
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) {
|
||||
return tracker;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if ( ! rodQueue.hasNext() )
|
||||
return false;
|
||||
else {
|
||||
ReferenceOrderedDatum peeked = rodQueue.peek();
|
||||
return ! peeked.getLocation().isPast(shard.getGenomeLoc());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next covered locus context in the shard.
|
||||
* @return Next covered locus context in the shard.
|
||||
* @throw NoSuchElementException if no such element exists.
|
||||
*/
|
||||
public AlignmentContext next() {
|
||||
if ( DEBUG ) System.out.printf("In RodLocusView.next()...%n");
|
||||
ReferenceOrderedDatum datum = rodQueue.next();
|
||||
if ( DEBUG ) System.out.printf("In RodLocusView.next(); datum = %s...%n", datum.getLocation());
|
||||
|
||||
if ( DEBUG ) System.out.printf("In RodLocusView.next(): creating tracker...%n");
|
||||
|
||||
// Update the tracker here for use
|
||||
Collection<ReferenceOrderedDatum> allRODsHere = getSpanningRods(datum);
|
||||
tracker = createTracker(allRODsHere);
|
||||
|
||||
GenomeLoc rodSite = datum.getLocation();
|
||||
GenomeLoc site = GenomeLocParser.createGenomeLoc( rodSite.getContigIndex(), rodSite.getStart(), rodSite.getStart());
|
||||
|
||||
if ( DEBUG ) System.out.printf("rodLocusView.next() is at %s%n", site);
|
||||
|
||||
// calculate the number of skipped bases, and update lastLoc so we can do that again in the next()
|
||||
long skippedBases = getSkippedBases( rodSite );
|
||||
lastLoc = site;
|
||||
return new AlignmentContext(site, new ArrayList<SAMRecord>(), new ArrayList<Integer>(), skippedBases);
|
||||
}
|
||||
|
||||
private RefMetaDataTracker createTracker( Collection<ReferenceOrderedDatum> allRodsHere ) {
|
||||
RefMetaDataTracker t = new RefMetaDataTracker();
|
||||
for ( ReferenceOrderedDatum element : allRodsHere ) {
|
||||
if ( ! t.hasROD(element.getName()) )
|
||||
t.bind(element.getName(), element);
|
||||
}
|
||||
|
||||
// special case the interval again -- add it into the ROD
|
||||
if ( interval != null ) { t.bind(interval.getName(), interval); }
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
private Collection<ReferenceOrderedDatum> getSpanningRods(ReferenceOrderedDatum marker) {
|
||||
return rodQueue.allElementsLTE(marker);
|
||||
}
|
||||
|
||||
// private Collection<ReferenceOrderedDatum> getSpanningRods(ReferenceOrderedDatum marker) {
|
||||
// Collection<ReferenceOrderedDatum> allFromQueue = rodQueue.allElementsLTE(marker);
|
||||
// Collection<ReferenceOrderedDatum> all = new LinkedList<ReferenceOrderedDatum>(allFromQueue);
|
||||
//
|
||||
// Iterator<ReferenceOrderedDatum> it = multiLocusRODs.iterator();
|
||||
// while ( it.hasNext() ) {
|
||||
// ReferenceOrderedDatum mlr = it.next();
|
||||
// if ( mlr.getLocation().overlapsP(marker.getLocation()) ) {
|
||||
// all.add(mlr);
|
||||
// } else {
|
||||
// it.remove(); // no long covering this site
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// for ( ReferenceOrderedDatum datum : allFromQueue ) {
|
||||
// if ( ! datum.getLocation().isSingleBP() ) {
|
||||
// System.out.printf("Adding multi-locus %s%n", datum);
|
||||
// multiLocusRODs.add(datum);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return all;
|
||||
// }
|
||||
|
||||
private long getSkippedBases( GenomeLoc currentPos ) {
|
||||
long skippedBases = 0;
|
||||
|
||||
if ( lastLoc == null ) {
|
||||
// special case -- we're at the start
|
||||
//System.out.printf("Cur=%s, shard=%s%n", currentPos, shard.getGenomeLoc());
|
||||
skippedBases = currentPos.getStart() - shard.getGenomeLoc().getStart();
|
||||
} else {
|
||||
//System.out.printf("Cur=%s, last=%s%n", currentPos, lastLoc);
|
||||
skippedBases = currentPos.minus(lastLoc) - 1;
|
||||
}
|
||||
|
||||
if ( skippedBases < -1 ) { // minus 1 value is ok
|
||||
throw new RuntimeException(String.format("BUG: skipped bases is < 0: cur=%s vs. last=%s", currentPos, lastLoc));
|
||||
}
|
||||
return Math.max(skippedBases, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the location one after the last position we will traverse through
|
||||
* @return
|
||||
*/
|
||||
public GenomeLoc getLocOneBeyondShard() {
|
||||
return GenomeLocParser.createGenomeLoc( shard.getGenomeLoc().getContigIndex(),
|
||||
shard.getGenomeLoc().getStop()+1,
|
||||
shard.getGenomeLoc().getStop()+1);
|
||||
}
|
||||
|
||||
/**
|
||||
* How many bases are we skipping from the current location to the end of the interval / shard
|
||||
* if we have no more elements
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public long getLastSkippedBases() {
|
||||
if ( hasNext() )
|
||||
throw new RuntimeException("BUG: getLastSkippedBases called when there are elements remaining.");
|
||||
|
||||
return getSkippedBases(getLocOneBeyondShard());
|
||||
}
|
||||
|
||||
public RefMetaDataTracker getTracker() {
|
||||
return tracker;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the current view.
|
||||
*/
|
||||
public void close() {
|
||||
//rodQueue.close();
|
||||
|
||||
rodQueue = null;
|
||||
tracker = null;
|
||||
}
|
||||
}
|
||||
|
|
@ -35,6 +35,7 @@ public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatu
|
|||
public abstract GenomeLoc getLocation();
|
||||
|
||||
public int compareTo( ReferenceOrderedDatum that ) {
|
||||
//System.out.printf("Comparing %s to %s => %d%n", getLocation(), that.getLocation(), getLocation().compareTo(that.getLocation()));
|
||||
return getLocation().compareTo(that.getLocation());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -158,7 +158,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
|
|||
String contig = parts[1];
|
||||
long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
||||
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
||||
loc = GenomeLocParser.parseGenomeLoc(contig, start, stop-1);
|
||||
loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop-1));
|
||||
|
||||
name = parts[4];
|
||||
strand = parts[6];
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
|
|
@ -19,6 +20,8 @@ import java.util.ArrayList;
|
|||
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
||||
*/
|
||||
public class TraverseLoci extends TraversalEngine {
|
||||
final private static String UNIT_STRING = "sites";
|
||||
|
||||
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
|
|
@ -45,32 +48,59 @@ public class TraverseLoci extends TraversalEngine {
|
|||
LocusWalker<M, T> locusWalker = (LocusWalker<M, T>)walker;
|
||||
|
||||
LocusView locusView = getLocusView( walker, dataProvider );
|
||||
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
while( locusView.hasNext() ) {
|
||||
AlignmentContext locus = locusView.next();
|
||||
|
||||
TraversalStatistics.nRecords++;
|
||||
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA )
|
||||
throw new RuntimeException("Engine currently doesn't support RodWalkers");
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this locus
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation());
|
||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||
|
||||
ReferenceContext refContext = referenceView.getReferenceContext(locus.getLocation());
|
||||
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
|
||||
ReferenceOrderedView referenceOrderedDataView = null;
|
||||
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
|
||||
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
|
||||
else
|
||||
referenceOrderedDataView = (RodLocusView)locusView;
|
||||
|
||||
final boolean keepMeP = locusWalker.filter(tracker, refContext, locus);
|
||||
if (keepMeP) {
|
||||
M x = locusWalker.map(tracker, refContext, locus);
|
||||
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
while( locusView.hasNext() ) {
|
||||
AlignmentContext locus = locusView.next();
|
||||
|
||||
TraversalStatistics.nRecords++;
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this locus
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation());
|
||||
|
||||
ReferenceContext refContext = referenceView.getReferenceContext(locus.getLocation());
|
||||
|
||||
final boolean keepMeP = locusWalker.filter(tracker, refContext, locus);
|
||||
if (keepMeP) {
|
||||
M x = locusWalker.map(tracker, refContext, locus);
|
||||
sum = locusWalker.reduce(x, sum);
|
||||
}
|
||||
|
||||
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||
break;
|
||||
}
|
||||
|
||||
printProgress(UNIT_STRING, locus.getLocation());
|
||||
}
|
||||
}
|
||||
|
||||
// We have a final map call to execute here to clean up the skipped based from the
|
||||
// last position in the ROD to that in the interval
|
||||
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) {
|
||||
RodLocusView rodLocusView = (RodLocusView)locusView;
|
||||
long nSkipped = rodLocusView.getLastSkippedBases();
|
||||
if ( nSkipped > 0 ) {
|
||||
// no sense in making the call if you don't have anything interesting to say
|
||||
AlignmentContext ac = new AlignmentContext(rodLocusView.getLocOneBeyondShard(), null, null, nSkipped);
|
||||
M x = locusWalker.map(null, null, ac);
|
||||
sum = locusWalker.reduce(x, sum);
|
||||
}
|
||||
|
||||
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
|
||||
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
|
||||
break;
|
||||
}
|
||||
|
||||
printProgress("loci", locus.getLocation());
|
||||
}
|
||||
|
||||
return sum;
|
||||
|
|
@ -78,12 +108,12 @@ public class TraverseLoci extends TraversalEngine {
|
|||
|
||||
/**
|
||||
* Temporary override of printOnTraversalDone.
|
||||
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
|
||||
*
|
||||
* @param sum Result of the computation.
|
||||
* @param <T> Type of the result.
|
||||
*/
|
||||
public <T> void printOnTraversalDone( T sum ) {
|
||||
printOnTraversalDone( "loci", sum );
|
||||
printOnTraversalDone( UNIT_STRING, sum );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -97,6 +127,8 @@ public class TraverseLoci extends TraversalEngine {
|
|||
return new CoveredLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE )
|
||||
return new AllLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
|
||||
return new RodLocusView(dataProvider);
|
||||
else
|
||||
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,10 +2,7 @@ package org.broadinstitute.sting.gatk.traversals;
|
|||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -39,7 +36,7 @@ public class TraverseLocusWindows extends TraversalEngine {
|
|||
|
||||
ReadView readView = new ReadView( dataProvider );
|
||||
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
|
||||
ReferenceOrderedView referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
|
||||
|
||||
AlignmentContext locus = getLocusContext(readView.iterator(), interval);
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,43 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Same as CountRod but this one is a reference walker
|
||||
*/
|
||||
public class CountRodByRefWalker extends RefWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
|
||||
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
|
||||
public boolean verbose = false;
|
||||
|
||||
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
|
||||
public boolean showSkipped = false;
|
||||
|
||||
CountRodWalker crw = new CountRodWalker();
|
||||
|
||||
public void initialize() {
|
||||
crw.verbose = verbose;
|
||||
crw.showSkipped = showSkipped;
|
||||
crw.out = out;
|
||||
crw.err = err;
|
||||
}
|
||||
|
||||
public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
return crw.map(tracker, ref, context);
|
||||
}
|
||||
|
||||
public Pair<ExpandingArrayList<Long>, Long> reduceInit() {
|
||||
return crw.reduceInit();
|
||||
}
|
||||
|
||||
public Pair<ExpandingArrayList<Long>, Long> reduce(CountRodWalker.Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
|
||||
return crw.reduce(point, sum);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,110 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
|
||||
public class CountRodWalker extends RodWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
|
||||
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
|
||||
public boolean verbose = false;
|
||||
|
||||
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
|
||||
public boolean showSkipped = false;
|
||||
|
||||
public class Datum {
|
||||
public long nRodsAtThisLocation = 0;
|
||||
public long nSkippedBases =0;
|
||||
public long nTotalBases = 0;
|
||||
|
||||
public Datum( long nRodsAtThisLocation, long nSkippedBases, long nTotalBases ) {
|
||||
this.nRodsAtThisLocation = nRodsAtThisLocation;
|
||||
this.nSkippedBases = nSkippedBases;
|
||||
this.nTotalBases = nTotalBases;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("<%d %d %d>", nRodsAtThisLocation, nSkippedBases, nTotalBases);
|
||||
}
|
||||
}
|
||||
|
||||
public Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
GenomeLoc cur = context.getLocation();
|
||||
|
||||
if ( verbose && showSkipped ) {
|
||||
for ( int i = 0; i < context.getSkippedBases(); i++ ) {
|
||||
out.printf("%s: skipped%n", GenomeLocParser.incPos(cur, i - context.getSkippedBases()));
|
||||
}
|
||||
}
|
||||
|
||||
long nRodsHere = 0;
|
||||
long nTotalBases = 0;
|
||||
|
||||
if ( ref == null ) {
|
||||
// we're getting the last skipped update
|
||||
if ( verbose )
|
||||
out.printf("Last position was %s: skipping %d bases%n",
|
||||
context.getLocation(), context.getSkippedBases() );
|
||||
nRodsHere = -1; // don't update this
|
||||
nTotalBases = context.getSkippedBases();
|
||||
} else {
|
||||
Collection<ReferenceOrderedDatum> rods = new LinkedList<ReferenceOrderedDatum>();
|
||||
for ( ReferenceOrderedDatum rod : tracker.getBoundRods() ) {
|
||||
//System.out.printf("Considering rod %s%n", rod);
|
||||
if ( rod.getLocation().getStart() == context.getLocation().getStart() && ! rod.getName().equals("interval") ) {
|
||||
// only consider the first element
|
||||
//System.out.printf("adding it%n");
|
||||
rods.add(rod);
|
||||
}
|
||||
}
|
||||
|
||||
nRodsHere = rods.size();
|
||||
|
||||
if ( nRodsHere > 0 ) {
|
||||
if ( verbose ) {
|
||||
List<String> names = new ArrayList<String>();
|
||||
for ( ReferenceOrderedDatum rod : rods ) {
|
||||
names.add(rod.getName());
|
||||
}
|
||||
|
||||
//System.out.printf("context is %s", context.getSkippedBases());
|
||||
out.printf("At %s: found %d rod(s) [%s] after skipping %d bases%n",
|
||||
context.getLocation(), nRodsHere, Utils.join(",", names), context.getSkippedBases() );
|
||||
}
|
||||
}
|
||||
|
||||
nTotalBases = context.getSkippedBases() + 1;
|
||||
}
|
||||
|
||||
return new Datum(nRodsHere, context.getSkippedBases(), nTotalBases);
|
||||
}
|
||||
|
||||
public Pair<ExpandingArrayList<Long>, Long> reduceInit() {
|
||||
return new Pair<ExpandingArrayList<Long>, Long>(new ExpandingArrayList<Long>(), 0l);
|
||||
}
|
||||
|
||||
private void updateCounts(ExpandingArrayList<Long> counts, long nRods, long nObs) {
|
||||
if ( nRods >= 0 ) {
|
||||
long prev = counts.get((int)nRods) == null ? 0l : counts.get((int)nRods);
|
||||
counts.set((int)nRods, nObs + prev);
|
||||
}
|
||||
}
|
||||
|
||||
public Pair<ExpandingArrayList<Long>, Long> reduce(Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
|
||||
ExpandingArrayList<Long> counts = sum.getFirst();
|
||||
updateCounts(counts, point.nRodsAtThisLocation, 1);
|
||||
updateCounts(counts, 0, point.nSkippedBases);
|
||||
|
||||
Pair<ExpandingArrayList<Long>, Long> r = new Pair<ExpandingArrayList<Long>, Long>(counts, point.nTotalBases + sum.getSecond());
|
||||
|
||||
//System.out.printf("Reduce: %s %s => %s%n", point, sum, r);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
|
@ -18,5 +18,6 @@ package org.broadinstitute.sting.gatk.walkers;
|
|||
public enum DataSource {
|
||||
READS,
|
||||
REFERENCE,
|
||||
REFERENCE_BASES // Do I actually need the reference bases passed to the walker?
|
||||
REFERENCE_BASES, // Do I actually need the reference bases passed to the walker?
|
||||
REFERENCE_ORDERED_DATA
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,14 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Feb 22, 2009
|
||||
* Time: 2:52:28 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
@By(DataSource.REFERENCE_ORDERED_DATA)
|
||||
@Requires({DataSource.REFERENCE, DataSource.REFERENCE_ORDERED_DATA})
|
||||
@Allows({DataSource.REFERENCE, DataSource.REFERENCE_ORDERED_DATA})
|
||||
public abstract class RodWalker<MapType, ReduceType> extends LocusWalker<MapType, ReduceType> {
|
||||
}
|
||||
|
|
@ -54,7 +54,7 @@ public class ReferenceOrderedViewTest extends BaseTest {
|
|||
public void testNoBindings() {
|
||||
Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc("chrM",1,30));
|
||||
ShardDataProvider provider = new ShardDataProvider(shard, null, seq, Collections.<ReferenceOrderedDataSource>emptyList());
|
||||
ReferenceOrderedView view = new ReferenceOrderedView( provider );
|
||||
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
|
||||
|
||||
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",10));
|
||||
Assert.assertNull("The tracker should not have produced any data", tracker.lookup("tableTest",null));
|
||||
|
|
@ -72,7 +72,7 @@ public class ReferenceOrderedViewTest extends BaseTest {
|
|||
Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc("chrM",1,30));
|
||||
|
||||
ShardDataProvider provider = new ShardDataProvider(shard, null, seq, Collections.singletonList(dataSource));
|
||||
ReferenceOrderedView view = new ReferenceOrderedView( provider );
|
||||
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
|
||||
|
||||
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20));
|
||||
TabularROD datum = (TabularROD)tracker.lookup("tableTest",null);
|
||||
|
|
@ -98,7 +98,7 @@ public class ReferenceOrderedViewTest extends BaseTest {
|
|||
Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc("chrM",1,30));
|
||||
|
||||
ShardDataProvider provider = new ShardDataProvider(shard, null, seq, Arrays.asList(dataSource1,dataSource2));
|
||||
ReferenceOrderedView view = new ReferenceOrderedView( provider );
|
||||
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
|
||||
|
||||
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20));
|
||||
TabularROD datum1 = (TabularROD)tracker.lookup("tableTest1",null);
|
||||
|
|
|
|||
|
|
@ -23,17 +23,20 @@ class Status:
|
|||
def __init__(self, file, exists, size):
|
||||
self.file = file
|
||||
self.exists = exists
|
||||
self.size = size
|
||||
self._size = size
|
||||
|
||||
if not exists: self.status = "missing"
|
||||
if size == 0: self.status = "no-data"
|
||||
else: self.status = "exists: bytes=" + str(self.size)
|
||||
else: self.status = "exists: bytes=" + str(self.size())
|
||||
|
||||
def __str__(self):
|
||||
return self.status
|
||||
return self.file + " " + self.status
|
||||
|
||||
def size(self):
|
||||
return self._size
|
||||
|
||||
def viewSize(self):
|
||||
return MergeBAMsUtils.greek(self.size)
|
||||
return MergeBAMsUtils.greek(self.size())
|
||||
|
||||
|
||||
def md5(file):
|
||||
|
|
@ -50,10 +53,10 @@ class ComparedFiles:
|
|||
self.ftpStat = ftpStat
|
||||
|
||||
def size(self):
|
||||
if self.localStat.size <> 0:
|
||||
return self.localStat.size
|
||||
if self.ftpStat.size <> 0:
|
||||
return self.ftpStat.size
|
||||
if self.localStat.size() <> 0:
|
||||
return self.localStat.size()
|
||||
if self.ftpStat.size() <> 0:
|
||||
return self.ftpStat.size()
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
|
@ -67,7 +70,7 @@ def modTimeStr(t):
|
|||
if t == 0:
|
||||
return 'N/A'
|
||||
else:
|
||||
return time.strftime("%m/%d/%y", time.localtime(t))
|
||||
return time.strftime("%m/%d/%y", time.localtime(int(t)))
|
||||
|
||||
def getSizeForFile(dir, filename):
|
||||
global CACHED_LIST
|
||||
|
|
@ -141,9 +144,10 @@ def validateFile(relPath, localRoot, ftpRoot):
|
|||
return compared
|
||||
|
||||
def compareFileStatus(localStat, ftpStat):
|
||||
if DEBUG: print 'comparing', localStat, ftpStat
|
||||
if localStat.exists:
|
||||
if ftpStat.exists:
|
||||
if localStat.size == ftpStat.size:
|
||||
if localStat.size() == ftpStat.size():
|
||||
status = 'in-sync'
|
||||
else:
|
||||
status = 'size-mismatch'
|
||||
|
|
@ -165,6 +169,8 @@ def filesInLocalPath(root, subdir):
|
|||
if subdir <> None:
|
||||
for fullroot, dirs, files in os.walk(os.path.join(root, subdir)):
|
||||
for file in filter( regex.match, files ):
|
||||
#if file <> "NA12761.SLX.WUGSC.Mosaik.SRP000033.2009_08.bam.bai":
|
||||
# continue
|
||||
fullpath = os.path.join(fullroot, file)
|
||||
path = fullpath.split(root)[1]
|
||||
#print 'adding relpath=', path, 'fullpath=', fullpath
|
||||
|
|
@ -266,12 +272,15 @@ if __name__ == "__main__":
|
|||
statusForFileListing = ['size-mismatch', 'local-file-missing']
|
||||
maxFilesToList = 10
|
||||
if status in statusForFileListing:
|
||||
print 'SUMMARY: listing the first', maxFilesToList, 'of', n
|
||||
print 'SUMMARY: listing the first', min(maxFilesToList, n), 'of', n
|
||||
for file in itertools.islice(filesOfStatus, maxFilesToList):
|
||||
print 'SUMMARY: File: %8s %12s %s' % ( MergeBAMsUtils.greek(file.size()), modTimeStr(file.modTime()), file.file)
|
||||
if status == 'size-mismatch':
|
||||
print 'SUMMARY: File: ftp=%d bytes local=%d bytes %12s %s' % ( file.ftpStat.size(), file.localStat.size() , modTimeStr(file.modTime()), file.file)
|
||||
else:
|
||||
print 'SUMMARY: File: %8s %12s %s' % ( MergeBAMsUtils.greek(file.size()), modTimeStr(file.modTime()), file.file)
|
||||
if n > 0:
|
||||
fileSizes = MergeBAMsUtils.greek(reduce(operator.__add__, map( ComparedFiles.size, filesOfStatus ), 0 ))
|
||||
mostRecentMod = modTimeStr(apply(max, map( ComparedFiles.modTime, filesOfStatus )))
|
||||
|
||||
print 'SUMMARY: total size %s' % ( fileSizes )
|
||||
print 'SUMMARY: last modification time %s' % ( mostRecentMod )
|
||||
fileSizes = MergeBAMsUtils.greek(reduce(operator.__add__, map( ComparedFiles.size, filesOfStatus ), 0 ))
|
||||
mostRecentMod = modTimeStr(apply(max, map( ComparedFiles.modTime, filesOfStatus ) + [0]))
|
||||
|
||||
print 'SUMMARY: total size %s' % ( fileSizes )
|
||||
print 'SUMMARY: last modification time %s' % ( mostRecentMod )
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
/humgen/gsa-scr1/GATK_Data/Validation_Data/MV1994.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Escherichia_coli_K12_MG1655.fasta *
|
||||
|
||||
# daughter chrom 6 deep coverage first 10 MB
|
||||
/broad/1KG/DCC/ftp/pilot_data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam /broad/1KG/reference/human_b36_both.fasta 6:1-10,000,00
|
||||
/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam /broad/1KG/reference/human_b36_both.fasta 1:10,500,000-11,000,000
|
||||
|
||||
# daughter chrom 1 deep coverage
|
||||
/broad/1KG/DCC/ftp/pilot_data/NA12878/alignment/NA12878.chrom1.SLX.SRP000032.2009_06.bam /broad/1KG/reference/human_b36_both.fasta *
|
||||
|
|
|
|||
Loading…
Reference in New Issue