Framework for ROD walkers -- totally experiment and not working right now

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1600 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-09-12 19:13:15 +00:00
parent bd75a8d168
commit 6e13a36059
15 changed files with 484 additions and 68 deletions

View File

@ -507,11 +507,14 @@ public class GenomeAnalysisEngine {
ReferenceSequenceFile drivingDataSource,
GenomeLocSortedSet intervals,
Integer maxIterations) {
final long SHARD_SIZE = 100000L;
long SHARD_SIZE = 100000L;
ShardStrategy shardStrategy = null;
ShardStrategyFactory.SHATTER_STRATEGY shardType;
if (walker instanceof LocusWalker) {
if ( walker instanceof RodWalker ) SHARD_SIZE *= 100;
if (intervals != null) {
shardType = (walker.isReduceByInterval()) ?
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
@ -525,7 +528,6 @@ public class GenomeAnalysisEngine {
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE, maxIterations);
} else if (walker instanceof ReadWalker ||
walker instanceof DuplicateWalker) {

View File

@ -22,19 +22,6 @@ import org.apache.log4j.Logger;
*/
public class CoveredLocusView extends LocusView {
/**
* Gets the position to which the last seek was requested.
*/
private GenomeLoc seekPoint;
/**
* What's the context for the last locus accessed?
* @param provider
*/
private AlignmentContext nextLocusContext = null;
private static Logger logger = Logger.getLogger(CoveredLocusView.class);
/**
* Create a new queue of locus contexts.
* @param provider

View File

@ -25,11 +25,11 @@ import java.util.Collections;
/**
* A view into the reference-ordered data in the provider.
*/
public class ReferenceOrderedView implements View {
public class ManagingReferenceOrderedView implements ReferenceOrderedView {
/**
* The provider that's supplying our backing data.
*/
private final ShardDataProvider provider;
//private final ShardDataProvider provider;
/**
* The data sources along with their current states.
@ -40,8 +40,8 @@ public class ReferenceOrderedView implements View {
* Create a new view of reference-ordered data.
* @param provider
*/
public ReferenceOrderedView( ShardDataProvider provider ) {
this.provider = provider;
public ManagingReferenceOrderedView( ShardDataProvider provider ) {
//this.provider = provider;
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() )
states.add( new ReferenceOrderedDataState( dataSource, (RODIterator)dataSource.seek(provider.getShard()) ) );

View File

@ -0,0 +1,220 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RODIterator;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MergingIterator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.*;
import net.sf.samtools.SAMRecord;
/**
* User: hanna
* Date: May 21, 2009
* Time: 2:49:17 PM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* A view into the reference-ordered data in the provider.
*/
public class RodLocusView extends LocusView implements ReferenceOrderedView {
/**
* The provider that's supplying our backing data.
*/
//private final ShardDataProvider provider;
/**
* The data sources along with their current states.
*/
private MergingIterator<ReferenceOrderedDatum> rodQueue = null;
RefMetaDataTracker tracker = null;
GenomeLoc lastLoc = null;
ReferenceOrderedDatum interval = null;
// broken support for multi-locus rods
//List<ReferenceOrderedDatum> multiLocusRODs = new LinkedList<ReferenceOrderedDatum>();
/**
* Enable debugging output -- todo remove me
*/
final static boolean DEBUG = false;
final static String INTERVAL_ROD_NAME = "interval";
/**
* Create a new view of reference-ordered data.
*
* @param provider
*/
public RodLocusView( ShardDataProvider provider ) {
super(provider);
GenomeLoc loc = provider.getShard().getGenomeLoc();
List<Iterator<ReferenceOrderedDatum>> iterators = new LinkedList<Iterator<ReferenceOrderedDatum>>();
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) {
if ( DEBUG ) System.out.printf("Shard is %s%n", loc);
RODIterator it = (RODIterator)dataSource.seek(provider.getShard());
ReferenceOrderedDatum x = it.seekForward(loc);
// we need to special case the interval so we don't always think there's a rod at the first location
if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) {
if ( interval != null )
throw new RuntimeException("BUG: interval local variable already assigned " + interval);
interval = x;
} else {
iterators.add( (Iterator<ReferenceOrderedDatum>)it );
}
}
rodQueue = new MergingIterator<ReferenceOrderedDatum>(iterators);
}
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) {
return tracker;
}
public boolean hasNext() {
if ( ! rodQueue.hasNext() )
return false;
else {
ReferenceOrderedDatum peeked = rodQueue.peek();
return ! peeked.getLocation().isPast(shard.getGenomeLoc());
}
}
/**
* Returns the next covered locus context in the shard.
* @return Next covered locus context in the shard.
* @throw NoSuchElementException if no such element exists.
*/
public AlignmentContext next() {
if ( DEBUG ) System.out.printf("In RodLocusView.next()...%n");
ReferenceOrderedDatum datum = rodQueue.next();
if ( DEBUG ) System.out.printf("In RodLocusView.next(); datum = %s...%n", datum.getLocation());
if ( DEBUG ) System.out.printf("In RodLocusView.next(): creating tracker...%n");
// Update the tracker here for use
Collection<ReferenceOrderedDatum> allRODsHere = getSpanningRods(datum);
tracker = createTracker(allRODsHere);
GenomeLoc rodSite = datum.getLocation();
GenomeLoc site = GenomeLocParser.createGenomeLoc( rodSite.getContigIndex(), rodSite.getStart(), rodSite.getStart());
if ( DEBUG ) System.out.printf("rodLocusView.next() is at %s%n", site);
// calculate the number of skipped bases, and update lastLoc so we can do that again in the next()
long skippedBases = getSkippedBases( rodSite );
lastLoc = site;
return new AlignmentContext(site, new ArrayList<SAMRecord>(), new ArrayList<Integer>(), skippedBases);
}
private RefMetaDataTracker createTracker( Collection<ReferenceOrderedDatum> allRodsHere ) {
RefMetaDataTracker t = new RefMetaDataTracker();
for ( ReferenceOrderedDatum element : allRodsHere ) {
if ( ! t.hasROD(element.getName()) )
t.bind(element.getName(), element);
}
// special case the interval again -- add it into the ROD
if ( interval != null ) { t.bind(interval.getName(), interval); }
return t;
}
private Collection<ReferenceOrderedDatum> getSpanningRods(ReferenceOrderedDatum marker) {
return rodQueue.allElementsLTE(marker);
}
// private Collection<ReferenceOrderedDatum> getSpanningRods(ReferenceOrderedDatum marker) {
// Collection<ReferenceOrderedDatum> allFromQueue = rodQueue.allElementsLTE(marker);
// Collection<ReferenceOrderedDatum> all = new LinkedList<ReferenceOrderedDatum>(allFromQueue);
//
// Iterator<ReferenceOrderedDatum> it = multiLocusRODs.iterator();
// while ( it.hasNext() ) {
// ReferenceOrderedDatum mlr = it.next();
// if ( mlr.getLocation().overlapsP(marker.getLocation()) ) {
// all.add(mlr);
// } else {
// it.remove(); // no long covering this site
// }
// }
//
// for ( ReferenceOrderedDatum datum : allFromQueue ) {
// if ( ! datum.getLocation().isSingleBP() ) {
// System.out.printf("Adding multi-locus %s%n", datum);
// multiLocusRODs.add(datum);
// }
// }
//
// return all;
// }
private long getSkippedBases( GenomeLoc currentPos ) {
long skippedBases = 0;
if ( lastLoc == null ) {
// special case -- we're at the start
//System.out.printf("Cur=%s, shard=%s%n", currentPos, shard.getGenomeLoc());
skippedBases = currentPos.getStart() - shard.getGenomeLoc().getStart();
} else {
//System.out.printf("Cur=%s, last=%s%n", currentPos, lastLoc);
skippedBases = currentPos.minus(lastLoc) - 1;
}
if ( skippedBases < -1 ) { // minus 1 value is ok
throw new RuntimeException(String.format("BUG: skipped bases is < 0: cur=%s vs. last=%s", currentPos, lastLoc));
}
return Math.max(skippedBases, 0);
}
/**
* Get the location one after the last position we will traverse through
* @return
*/
public GenomeLoc getLocOneBeyondShard() {
return GenomeLocParser.createGenomeLoc( shard.getGenomeLoc().getContigIndex(),
shard.getGenomeLoc().getStop()+1,
shard.getGenomeLoc().getStop()+1);
}
/**
* How many bases are we skipping from the current location to the end of the interval / shard
* if we have no more elements
*
* @return
*/
public long getLastSkippedBases() {
if ( hasNext() )
throw new RuntimeException("BUG: getLastSkippedBases called when there are elements remaining.");
return getSkippedBases(getLocOneBeyondShard());
}
public RefMetaDataTracker getTracker() {
return tracker;
}
/**
* Closes the current view.
*/
public void close() {
//rodQueue.close();
rodQueue = null;
tracker = null;
}
}

View File

@ -35,6 +35,7 @@ public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatu
public abstract GenomeLoc getLocation();
public int compareTo( ReferenceOrderedDatum that ) {
//System.out.printf("Comparing %s to %s => %d%n", getLocation(), that.getLocation(), getLocation().compareTo(that.getLocation()));
return getLocation().compareTo(that.getLocation());
}

View File

@ -158,7 +158,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
String contig = parts[1];
long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
loc = GenomeLocParser.parseGenomeLoc(contig, start, stop-1);
loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop-1));
name = parts[4];
strand = parts[6];

View File

@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
@ -19,6 +20,8 @@ import java.util.ArrayList;
* A simple solution to iterating over all reference positions over a series of genomic locations.
*/
public class TraverseLoci extends TraversalEngine {
final private static String UNIT_STRING = "sites";
/**
* our log, which we want to capture anything from this class
@ -45,32 +48,59 @@ public class TraverseLoci extends TraversalEngine {
LocusWalker<M, T> locusWalker = (LocusWalker<M, T>)walker;
LocusView locusView = getLocusView( walker, dataProvider );
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
// We keep processing while the next reference location is within the interval
while( locusView.hasNext() ) {
AlignmentContext locus = locusView.next();
TraversalStatistics.nRecords++;
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA )
throw new RuntimeException("Engine currently doesn't support RodWalkers");
// Iterate forward to get all reference ordered data covering this locus
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation());
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
ReferenceContext refContext = referenceView.getReferenceContext(locus.getLocation());
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
ReferenceOrderedView referenceOrderedDataView = null;
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
else
referenceOrderedDataView = (RodLocusView)locusView;
final boolean keepMeP = locusWalker.filter(tracker, refContext, locus);
if (keepMeP) {
M x = locusWalker.map(tracker, refContext, locus);
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
// We keep processing while the next reference location is within the interval
while( locusView.hasNext() ) {
AlignmentContext locus = locusView.next();
TraversalStatistics.nRecords++;
// Iterate forward to get all reference ordered data covering this locus
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation());
ReferenceContext refContext = referenceView.getReferenceContext(locus.getLocation());
final boolean keepMeP = locusWalker.filter(tracker, refContext, locus);
if (keepMeP) {
M x = locusWalker.map(tracker, refContext, locus);
sum = locusWalker.reduce(x, sum);
}
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
break;
}
printProgress(UNIT_STRING, locus.getLocation());
}
}
// We have a final map call to execute here to clean up the skipped based from the
// last position in the ROD to that in the interval
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) {
RodLocusView rodLocusView = (RodLocusView)locusView;
long nSkipped = rodLocusView.getLastSkippedBases();
if ( nSkipped > 0 ) {
// no sense in making the call if you don't have anything interesting to say
AlignmentContext ac = new AlignmentContext(rodLocusView.getLocOneBeyondShard(), null, null, nSkipped);
M x = locusWalker.map(null, null, ac);
sum = locusWalker.reduce(x, sum);
}
if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
break;
}
printProgress("loci", locus.getLocation());
}
return sum;
@ -78,12 +108,12 @@ public class TraverseLoci extends TraversalEngine {
/**
* Temporary override of printOnTraversalDone.
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
*
* @param sum Result of the computation.
* @param <T> Type of the result.
*/
public <T> void printOnTraversalDone( T sum ) {
printOnTraversalDone( "loci", sum );
printOnTraversalDone( UNIT_STRING, sum );
}
/**
@ -97,6 +127,8 @@ public class TraverseLoci extends TraversalEngine {
return new CoveredLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE )
return new AllLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
return new RodLocusView(dataProvider);
else
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
}

View File

@ -2,10 +2,7 @@ package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -39,7 +36,7 @@ public class TraverseLocusWindows extends TraversalEngine {
ReadView readView = new ReadView( dataProvider );
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
ReferenceOrderedView referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
AlignmentContext locus = getLocusContext(readView.iterator(), interval);

View File

@ -0,0 +1,43 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.ArrayList;
import java.util.List;
/**
* Same as CountRod but this one is a reference walker
*/
public class CountRodByRefWalker extends RefWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
public boolean showSkipped = false;
CountRodWalker crw = new CountRodWalker();
public void initialize() {
crw.verbose = verbose;
crw.showSkipped = showSkipped;
crw.out = out;
crw.err = err;
}
public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
return crw.map(tracker, ref, context);
}
public Pair<ExpandingArrayList<Long>, Long> reduceInit() {
return crw.reduceInit();
}
public Pair<ExpandingArrayList<Long>, Long> reduce(CountRodWalker.Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
return crw.reduce(point, sum);
}
}

View File

@ -0,0 +1,110 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.ArrayList;
import java.util.List;
import java.util.Collection;
import java.util.LinkedList;
public class CountRodWalker extends RodWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
public boolean showSkipped = false;
public class Datum {
public long nRodsAtThisLocation = 0;
public long nSkippedBases =0;
public long nTotalBases = 0;
public Datum( long nRodsAtThisLocation, long nSkippedBases, long nTotalBases ) {
this.nRodsAtThisLocation = nRodsAtThisLocation;
this.nSkippedBases = nSkippedBases;
this.nTotalBases = nTotalBases;
}
public String toString() {
return String.format("<%d %d %d>", nRodsAtThisLocation, nSkippedBases, nTotalBases);
}
}
public Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
GenomeLoc cur = context.getLocation();
if ( verbose && showSkipped ) {
for ( int i = 0; i < context.getSkippedBases(); i++ ) {
out.printf("%s: skipped%n", GenomeLocParser.incPos(cur, i - context.getSkippedBases()));
}
}
long nRodsHere = 0;
long nTotalBases = 0;
if ( ref == null ) {
// we're getting the last skipped update
if ( verbose )
out.printf("Last position was %s: skipping %d bases%n",
context.getLocation(), context.getSkippedBases() );
nRodsHere = -1; // don't update this
nTotalBases = context.getSkippedBases();
} else {
Collection<ReferenceOrderedDatum> rods = new LinkedList<ReferenceOrderedDatum>();
for ( ReferenceOrderedDatum rod : tracker.getBoundRods() ) {
//System.out.printf("Considering rod %s%n", rod);
if ( rod.getLocation().getStart() == context.getLocation().getStart() && ! rod.getName().equals("interval") ) {
// only consider the first element
//System.out.printf("adding it%n");
rods.add(rod);
}
}
nRodsHere = rods.size();
if ( nRodsHere > 0 ) {
if ( verbose ) {
List<String> names = new ArrayList<String>();
for ( ReferenceOrderedDatum rod : rods ) {
names.add(rod.getName());
}
//System.out.printf("context is %s", context.getSkippedBases());
out.printf("At %s: found %d rod(s) [%s] after skipping %d bases%n",
context.getLocation(), nRodsHere, Utils.join(",", names), context.getSkippedBases() );
}
}
nTotalBases = context.getSkippedBases() + 1;
}
return new Datum(nRodsHere, context.getSkippedBases(), nTotalBases);
}
public Pair<ExpandingArrayList<Long>, Long> reduceInit() {
return new Pair<ExpandingArrayList<Long>, Long>(new ExpandingArrayList<Long>(), 0l);
}
private void updateCounts(ExpandingArrayList<Long> counts, long nRods, long nObs) {
if ( nRods >= 0 ) {
long prev = counts.get((int)nRods) == null ? 0l : counts.get((int)nRods);
counts.set((int)nRods, nObs + prev);
}
}
public Pair<ExpandingArrayList<Long>, Long> reduce(Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
ExpandingArrayList<Long> counts = sum.getFirst();
updateCounts(counts, point.nRodsAtThisLocation, 1);
updateCounts(counts, 0, point.nSkippedBases);
Pair<ExpandingArrayList<Long>, Long> r = new Pair<ExpandingArrayList<Long>, Long>(counts, point.nTotalBases + sum.getSecond());
//System.out.printf("Reduce: %s %s => %s%n", point, sum, r);
return r;
}
}

View File

@ -18,5 +18,6 @@ package org.broadinstitute.sting.gatk.walkers;
public enum DataSource {
READS,
REFERENCE,
REFERENCE_BASES // Do I actually need the reference bases passed to the walker?
REFERENCE_BASES, // Do I actually need the reference bases passed to the walker?
REFERENCE_ORDERED_DATA
}

View File

@ -0,0 +1,14 @@
package org.broadinstitute.sting.gatk.walkers;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Feb 22, 2009
* Time: 2:52:28 PM
* To change this template use File | Settings | File Templates.
*/
@By(DataSource.REFERENCE_ORDERED_DATA)
@Requires({DataSource.REFERENCE, DataSource.REFERENCE_ORDERED_DATA})
@Allows({DataSource.REFERENCE, DataSource.REFERENCE_ORDERED_DATA})
public abstract class RodWalker<MapType, ReduceType> extends LocusWalker<MapType, ReduceType> {
}

View File

@ -54,7 +54,7 @@ public class ReferenceOrderedViewTest extends BaseTest {
public void testNoBindings() {
Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc("chrM",1,30));
ShardDataProvider provider = new ShardDataProvider(shard, null, seq, Collections.<ReferenceOrderedDataSource>emptyList());
ReferenceOrderedView view = new ReferenceOrderedView( provider );
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",10));
Assert.assertNull("The tracker should not have produced any data", tracker.lookup("tableTest",null));
@ -72,7 +72,7 @@ public class ReferenceOrderedViewTest extends BaseTest {
Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc("chrM",1,30));
ShardDataProvider provider = new ShardDataProvider(shard, null, seq, Collections.singletonList(dataSource));
ReferenceOrderedView view = new ReferenceOrderedView( provider );
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20));
TabularROD datum = (TabularROD)tracker.lookup("tableTest",null);
@ -98,7 +98,7 @@ public class ReferenceOrderedViewTest extends BaseTest {
Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc("chrM",1,30));
ShardDataProvider provider = new ShardDataProvider(shard, null, seq, Arrays.asList(dataSource1,dataSource2));
ReferenceOrderedView view = new ReferenceOrderedView( provider );
ReferenceOrderedView view = new ManagingReferenceOrderedView( provider );
RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20));
TabularROD datum1 = (TabularROD)tracker.lookup("tableTest1",null);

View File

@ -23,17 +23,20 @@ class Status:
def __init__(self, file, exists, size):
self.file = file
self.exists = exists
self.size = size
self._size = size
if not exists: self.status = "missing"
if size == 0: self.status = "no-data"
else: self.status = "exists: bytes=" + str(self.size)
else: self.status = "exists: bytes=" + str(self.size())
def __str__(self):
return self.status
return self.file + " " + self.status
def size(self):
return self._size
def viewSize(self):
return MergeBAMsUtils.greek(self.size)
return MergeBAMsUtils.greek(self.size())
def md5(file):
@ -50,10 +53,10 @@ class ComparedFiles:
self.ftpStat = ftpStat
def size(self):
if self.localStat.size <> 0:
return self.localStat.size
if self.ftpStat.size <> 0:
return self.ftpStat.size
if self.localStat.size() <> 0:
return self.localStat.size()
if self.ftpStat.size() <> 0:
return self.ftpStat.size()
else:
return 0
@ -67,7 +70,7 @@ def modTimeStr(t):
if t == 0:
return 'N/A'
else:
return time.strftime("%m/%d/%y", time.localtime(t))
return time.strftime("%m/%d/%y", time.localtime(int(t)))
def getSizeForFile(dir, filename):
global CACHED_LIST
@ -141,9 +144,10 @@ def validateFile(relPath, localRoot, ftpRoot):
return compared
def compareFileStatus(localStat, ftpStat):
if DEBUG: print 'comparing', localStat, ftpStat
if localStat.exists:
if ftpStat.exists:
if localStat.size == ftpStat.size:
if localStat.size() == ftpStat.size():
status = 'in-sync'
else:
status = 'size-mismatch'
@ -165,6 +169,8 @@ def filesInLocalPath(root, subdir):
if subdir <> None:
for fullroot, dirs, files in os.walk(os.path.join(root, subdir)):
for file in filter( regex.match, files ):
#if file <> "NA12761.SLX.WUGSC.Mosaik.SRP000033.2009_08.bam.bai":
# continue
fullpath = os.path.join(fullroot, file)
path = fullpath.split(root)[1]
#print 'adding relpath=', path, 'fullpath=', fullpath
@ -266,12 +272,15 @@ if __name__ == "__main__":
statusForFileListing = ['size-mismatch', 'local-file-missing']
maxFilesToList = 10
if status in statusForFileListing:
print 'SUMMARY: listing the first', maxFilesToList, 'of', n
print 'SUMMARY: listing the first', min(maxFilesToList, n), 'of', n
for file in itertools.islice(filesOfStatus, maxFilesToList):
print 'SUMMARY: File: %8s %12s %s' % ( MergeBAMsUtils.greek(file.size()), modTimeStr(file.modTime()), file.file)
if status == 'size-mismatch':
print 'SUMMARY: File: ftp=%d bytes local=%d bytes %12s %s' % ( file.ftpStat.size(), file.localStat.size() , modTimeStr(file.modTime()), file.file)
else:
print 'SUMMARY: File: %8s %12s %s' % ( MergeBAMsUtils.greek(file.size()), modTimeStr(file.modTime()), file.file)
if n > 0:
fileSizes = MergeBAMsUtils.greek(reduce(operator.__add__, map( ComparedFiles.size, filesOfStatus ), 0 ))
mostRecentMod = modTimeStr(apply(max, map( ComparedFiles.modTime, filesOfStatus )))
print 'SUMMARY: total size %s' % ( fileSizes )
print 'SUMMARY: last modification time %s' % ( mostRecentMod )
fileSizes = MergeBAMsUtils.greek(reduce(operator.__add__, map( ComparedFiles.size, filesOfStatus ), 0 ))
mostRecentMod = modTimeStr(apply(max, map( ComparedFiles.modTime, filesOfStatus ) + [0]))
print 'SUMMARY: total size %s' % ( fileSizes )
print 'SUMMARY: last modification time %s' % ( mostRecentMod )

View File

@ -2,7 +2,7 @@
/humgen/gsa-scr1/GATK_Data/Validation_Data/MV1994.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Escherichia_coli_K12_MG1655.fasta *
# daughter chrom 6 deep coverage first 10 MB
/broad/1KG/DCC/ftp/pilot_data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam /broad/1KG/reference/human_b36_both.fasta 6:1-10,000,00
/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam /broad/1KG/reference/human_b36_both.fasta 1:10,500,000-11,000,000
# daughter chrom 1 deep coverage
/broad/1KG/DCC/ftp/pilot_data/NA12878/alignment/NA12878.chrom1.SLX.SRP000032.2009_06.bam /broad/1KG/reference/human_b36_both.fasta *